def test_arbitrary_rankings(self):
    """Test that _select_best_with_multiple_criteria can sort with multiple criteria and get the right answer.

    Each criterion is passed as a (key function, min-or-max) pair and only
    the first element of the returned sequence is checked.
    """
    a = (0, 1)
    b = (1, 1)
    c = (1, 2)
    d = (2, 1)
    e = (2, 2)
    f = (0, 4)
    g = (-1, 5)
    candidates = [a, b, c, d, e, f, g]
    select_best = EnsemblTranscriptDatasource._select_best_with_multiple_criteria

    # Sort by left minimum, then right minimum.
    result = select_best(candidates, [(lambda x: x[0], min), (lambda x: x[1], min)])
    self.assertEqual(result[0], g)

    # Sort by right minimum, then left minimum.
    result = select_best(candidates, [(lambda x: x[1], min), (lambda x: x[0], min)])
    self.assertEqual(result[0], a)

    # Sort by left maximum, then right minimum.  The tie-breaker has to key
    # on the right element (x[1]): d and e share the same left value, so a
    # second criterion on x[0] could never tell them apart.
    result = select_best(candidates, [(lambda x: x[0], max), (lambda x: x[1], min)])
    self.assertEqual(result[0], d)

    # Sort by sum, then right maximum.
    result = select_best(candidates, [(sum, max), (lambda x: x[1], max)])
    self.assertEqual(result[0], g)
def test_intitialize(self):
    """Test a simple initialization of an ensembl datasource.

    Builds the datasource from the title, version and src_file options of the
    saccer test config, then checks that a transcript mode set through
    set_tx_mode is reported back by get_tx_mode.
    """
    base_config_location = "testdata/ensembl/saccer/"
    config_parser = ConfigUtils.createConfigParser(base_config_location + "ensembl.config")
    title = config_parser.get("general", "title")
    version = config_parser.get("general", "version")
    src_file = config_parser.get("general", "src_file")

    ensembl_ds = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)
    self.assertIsNotNone(ensembl_ds)

    ensembl_ds.set_tx_mode(TranscriptProvider.TX_MODE_BEST_EFFECT)
    # assertEqual (rather than assertTrue on an == expression) reports both
    # values when the check fails.
    self.assertEqual(TranscriptProvider.TX_MODE_BEST_EFFECT, ensembl_ds.get_tx_mode())
def _create_test_gencode_ds(base_output_filename, protein_id_mapping_file, gencode_version):
    """Index the GENCODE test files for a fixed gene list and return a datasource over them.

    base_output_filename -- prefix for the generated index directories; any
        existing "<prefix>.transcript*.idx" trees are removed first.
    protein_id_mapping_file -- forwarded to construct_ensembl_indices.
    gencode_version -- GENCODE release number; selects the input file names
        and is reported as the datasource version ("v<number>").
    """
    gene_names = ["MAPK1", "MUC16", "PIK3CA", "YPEL1", "KRTAP4-7", "MAT2A", "DDX11L10"]
    version_tag = ".gencode.v" + str(gencode_version)
    gtf_paths = ["testdata/gencode/" + name + version_tag + ".annotation.gtf"
                 for name in gene_names]
    fasta_paths = ["testdata/gencode/" + name + version_tag + ".pc_transcripts.fa"
                   for name in gene_names]

    # Clear out indices left behind by an earlier run.
    for index_suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + index_suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices(
        gtf_paths, fasta_paths, base_output_filename,
        protein_id_mapping_file=protein_id_mapping_file)

    return EnsemblTranscriptDatasource(base_output_filename, title="GENCODE",
                                       version="v" + str(gencode_version), tx_filter="basic")
def createTranscriptProviderDatasource(config, tx_mode="CANONICAL", protocol="file"):
    """Create a GENCODE or GAF 3.0 transcript datasource from a config object.

    GENCODE is used when the path stored under [gencode] gencodeDir exists.
    Otherwise a GAF 3.0 datasource is built from the gaf_fname and
    gaf_transcript_seqs_fname options of the [gaf3.0] section.

    Raises Exception when no gencode dir exists and the GAF datasource cannot
    be created; the message carries the underlying failure reason.
    """
    gencode_dir = config.get("gencode", "gencodeDir")
    if os.path.exists(gencode_dir):
        return EnsemblTranscriptDatasource(gencode_dir + "/gencode.v19.annotation.gtf",
                                           title="GENCODE", version="TEST v19",
                                           tx_filter="basic", tx_mode=tx_mode)

    # No GENCODE directory on disk: fall back to GAF 3.0.
    try:
        gaf_path = config.get("gaf3.0", "gaf_fname")
        gaf_transcripts_path = config.get("gaf3.0", "gaf_transcript_seqs_fname")
        return Gaf(gaf_path, gaf_transcripts_path, tx_mode=tx_mode, protocol=protocol)
    except Exception as gaf_failure_reason:
        raise Exception(
            "Couldn't create a transcript provider datasource, no gencode dir found and %s"
            % gaf_failure_reason)
def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
    """Test genomic --> exon transform on real data.

    loc -- pair of genomic (start, end) positions handed to the conversion.
    gt_d -- expected (ground truth) exon-space start position.

    NOTE(review): the extra parameters are presumably supplied by a
    data-driven test decorator that is not visible here -- confirm.
    """
    gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
    gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
    base_output_filename = "out/test_variant_classification"

    # Remove indices left over from a previous run before rebuilding them.
    shutil.rmtree(base_output_filename + ".transcript.idx", ignore_errors=True)
    shutil.rmtree(base_output_filename + ".transcript_by_gene.idx", ignore_errors=True)
    shutil.rmtree(base_output_filename + ".transcript_by_gp_bin.idx", ignore_errors=True)

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta],
                                                   base_output_filename)
    ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")

    tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")
    start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])

    # The exon-space interval must be as long as the genomic interval.
    loc_length = (int(loc[1]) - int(loc[0]))
    self.assertEqual(end - start, loc_length,
                     str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))

    # Report the expected value (gt_d) in the failure message, not the
    # computed end position.
    self.assertEqual(start, gt_d,
                     "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")"
                     + " exons: " + str(tx[0].get_exons()))
def test_simple_annotate(self):
    """Annotate a simple example.

    Builds the datasource from the saccer test config and annotates a single
    C>A mutation at 22:22161963.
    """
    config_path = "testdata/ensembl/saccer/" + "ensembl.config"
    parser = ConfigUtils.createConfigParser(config_path)
    title, version, src_file = [parser.get("general", option)
                                for option in ("title", "version", "src_file")]
    datasource = EnsemblTranscriptDatasource(title=title, version=version, src_file=src_file)

    mutation = MutationData()
    mutation.chr = "22"
    mutation.start = "22161963"
    mutation.end = "22161963"
    mutation.ref_allele = "C"
    mutation.alt_allele = "A"

    # NOTE(review): the result is not inspected, so this only verifies that
    # annotation runs without raising.
    annotated = datasource.annotate_mutation(mutation)
def test_tie_breaking_rankings(self):
    """Check that _select_best_with_multiple_criteria still picks a winner when criteria tie."""
    # The three candidates differ only in their last field.
    low = (0, 0, 1)
    mid = (0, 0, 2)
    high = (0, 0, 3)
    candidates = [low, mid, high]

    # The first three criteria cannot separate the candidates (identical
    # fields or a constant key); only the last one can.
    criteria = [
        (lambda x: x[0], max),
        (lambda x: 3, min),
        (lambda x: x[1], max),
        (lambda x: x[2], max),
    ]
    winners = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(candidates, criteria)
    self.assertEqual(winners[0], high)
def _create_ensembl_ds_from_saccer(self):
    """Rebuild the S. cerevisiae test indices and return an Ensembl datasource over them.

    Existing "out/test_saccer_ds.transcript*.idx" trees are deleted before the
    indices are regenerated from the trimmed GTF and cDNA FASTA test files.
    """
    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
    output_prefix = "out/test_saccer_ds"

    for index_suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(output_prefix + index_suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], output_prefix)
    return EnsemblTranscriptDatasource(output_prefix, title="ensembl", version="71")
def test_tie_breaking_rankings(self):
    """Test that _select_best_with_multiple_criteria works with ties.

    All candidates agree on their first two fields and one criterion uses a
    constant key, so only the final criterion distinguishes them.
    """
    tied_candidates = [(0, 0, last) for last in (1, 2, 3)]
    expected_best = (0, 0, 3)

    best = EnsemblTranscriptDatasource._select_best_with_multiple_criteria(
        tied_candidates,
        [(lambda x: x[0], max),
         (lambda x: 3, min),
         (lambda x: x[1], max),
         (lambda x: x[2], max)])

    self.assertEqual(best[0], expected_best)
def createDatasourceFromConfigParser(configParser, leafDir):
    """Instantiate the datasource described by a leaf directory's config file.

    configParser -- config parser instance loaded from the config file in
        leafDir.  The "type" option of the [general] section selects the
        datasource class; which other options are read depends on that type.
        For information on config file format/conventions see (TODO)
    leafDir -- directory containing the config file and the files needed
        (post indexing and install steps) to instantiate the datasource.
        File-name options are prefixed with this directory.

    Returns the new datasource, with its hash code set from
    DatasourceFactory._retrieve_hash_code(leafDir).

    Raises RuntimeError for an unknown datasource type, or for a bigwig
    datasource when the ngslib library is not installed.
    """
    def general(option):
        # Shorthand for reading an option from the [general] section.
        return configParser.get("general", option)

    def in_leaf_dir(option):
        # Value of a [general] file-name option, prefixed with the leaf dir.
        return prefix + general(option)

    # Determine the type
    ds_type = general("type")
    prefix = leafDir + "/"

    # TODO: Replace these if statements with something a bit more robust,
    # such as a proper dependency injection framework
    if ds_type == "gaf":
        gaf_path = in_leaf_dir("gaf_fname")
        gaf_transcripts_path = in_leaf_dir("gaf_transcript_seqs_fname")
        datasource = Gaf(gaf_path, gaf_transcripts_path,
                         title=general("title"),
                         version=general("version"),
                         protocol=general("protocol"))
    elif ds_type == "dbsnp":
        datasource = dbSNP(in_leaf_dir("src_file"),
                           title=general("title"),
                           version=general("version"))
    elif ds_type == "ensembl":
        datasource = EnsemblTranscriptDatasource(in_leaf_dir("src_file"),
                                                 title=general("title"),
                                                 version=general("version"),
                                                 tx_filter=general("transcript_filter"))
    elif ds_type == "cosmic":
        datasource = Cosmic(src_file=in_leaf_dir("src_file"),
                            version=general("version"),
                            gpp_tabix_file=in_leaf_dir("gpp_src_file"))
    elif ds_type == "ref":
        # Fall back to a window of 10 when the config does not set one.
        window = general("windowSizeRef") if configParser.has_option("general", "windowSizeRef") else 10
        datasource = ReferenceDatasource(prefix,
                                         title=general("title"),
                                         version=general("version"),
                                         windowSizeRef=window)
    elif ds_type == "gene_tsv":
        datasource = GenericGeneDatasource(src_file=in_leaf_dir("src_file"),
                                           title=general("title"),
                                           version=general("version"),
                                           geneColumnName=general("gene_col"))
    elif ds_type == "transcript_tsv":
        datasource = GenericTranscriptDatasource(src_file=in_leaf_dir("src_file"),
                                                 title=general("title"),
                                                 version=general("version"),
                                                 geneColumnName=general("transcript_col"))
    elif ds_type == "vc_tsv":
        datasource = GenericVariantClassificationDatasource(src_file=in_leaf_dir("src_file"),
                                                            title=general("title"),
                                                            version=general("version"),
                                                            geneColumnName=general("vc_col"))
    elif ds_type == "gp_tsv":
        datasource = GenericGenomicPositionDatasource(src_file=in_leaf_dir("src_file"),
                                                      title=general("title"),
                                                      version=general("version"),
                                                      gpColumnNames=general("genomic_position_cols"))
    elif ds_type == "gm_tsv":
        datasource = GenericGenomicMutationDatasource(src_file=in_leaf_dir("src_file"),
                                                      title=general("title"),
                                                      version=general("version"),
                                                      gpColumnNames=general("genomic_position_cols"))
    elif ds_type == "gm_tsv_reverse_complement":
        datasource = GenericGenomicMutationDatasource(
            src_file=in_leaf_dir("src_file"),
            title=general("title"),
            version=general("version"),
            gpColumnNames=general("genomic_position_cols"),
            use_complementary_strand_alleles_for_negative_strand_transcripts=True)
    elif ds_type == "gpp_tsv":
        datasource = GenericGeneProteinPositionDatasource(
            src_file=in_leaf_dir("src_file"),
            title=general("title"),
            version=general("version"),
            gpColumnNames=general("gene_protein_position_cols"))
    elif ds_type == "transcript_to_uniprot_aa":
        datasource = TranscriptToUniProtProteinPositionTransformingDatasource(
            title=general("title"),
            version=general("version"),
            src_file="file://" + in_leaf_dir("src_file"),  # three slashes for sqlite
            inputPositionAnnotationName=general("inputPositionAnnotationName"),
            outputPositionAnnotationName=general("outputPositionAnnotationName"))
    elif ds_type == "mock_exception":
        datasource = MockExceptionThrowingDatasource(title=general("title"),
                                                     version=general("version"))
    elif ds_type == "indexed_vcf":
        datasource = IndexedVcfDatasource(src_file=in_leaf_dir("src_file"),
                                          title=general("title"),
                                          version=general("version"),
                                          match_mode=general("match_mode"))
    elif ds_type == "indexed_tsv":
        # The three column lists are stored as comma-separated strings.
        all_columns = general("column_names").split(",")
        annotation_columns = general("annotation_column_names").split(",")
        index_columns = general("index_column_names").split(",")

        DatasourceFactory._log_missing_column_name_msg(all_columns, annotation_columns)

        # Data types come from the [data_types] section, one option per
        # annotation column; blank column names are skipped.
        column_types = dict(
            (name, configParser.get("data_types", name))
            for name in annotation_columns
            if name.strip() != "")

        datasource = IndexedTsvDatasource(src_file=in_leaf_dir("src_file"),
                                          title=general("title"),
                                          version=general("version"),
                                          colNames=all_columns,
                                          annotationColNames=annotation_columns,
                                          indexColNames=index_columns,
                                          match_mode=general("match_mode"),
                                          colDataTypes=column_types)
    elif ds_type == "bigwig":
        if not NGSLIB_INSTALLED:
            raise RuntimeError("Bigwig datasource found in db-dir but ngslib library not installed.")
        datasource = BigWigDatasource(src_file=in_leaf_dir("src_file"),
                                      title=general("title"),
                                      version=general("version"))
    else:
        raise RuntimeError('Unknown datasource type: %s' % ds_type)

    datasource.set_hashcode(DatasourceFactory._retrieve_hash_code(leafDir))
    return datasource