def testExtentOutOfRangeError(self):
    ''' A window that extends beyond the beginning or end of a file is truncated.

    The truncated window is used for ref_context, and whatever is left is used for gc_content as well.
    '''
    datasource = ReferenceDatasource('testdata/reference_ds', windowSizeRef=6, windowSizeGCContent=5)

    mutation = MutationDataFactory.default_create()
    mutation.chr = "22"
    mutation.start = "4"
    mutation.end = "4"

    # For comparison: "CCCAAGCTAAACCCAGGCCAC" -- the expected value is the 10-base prefix of this string.
    expected_context = "CCCAAGCTAA"
    expected_gc = 5.0 / 9.0

    annotated = datasource.annotate_mutation(mutation)

    self.assertTrue(
        annotated['ref_context'] == expected_context,
        "ref_context was not populated properly: " + str(annotated['ref_context']))

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - expected_gc)
    self.assertTrue(
        gc_error < .001,
        "gc_content was not populated properly: " + str(annotated['gc_content']))
def testEmptyAnswer(self):
    ''' The Reference Datasource should return a blank ref_context if the chromosome is not found.

    Note: A log entry should also be written, but this is not tested.
    '''
    self.logger.info("Please ignore the next logging warning: testdata/reference_ds/chrTHIS_DOES_NOT_EXIST.txt not found. Please add it.")
    ds = ReferenceDatasource('testdata/reference_ds')

    m = MutationDataFactory.default_create()
    m.chr = "THIS_DOES_NOT_EXIST"
    m.start = "11"
    m.end = "11"

    groundTruth = ""

    # The result of annotate_mutation is indexed directly by annotation name.
    guess = ds.annotate_mutation(m)

    # assertEqual (rather than assertTrue on an == expression) so that the
    # comparison itself is what the test framework evaluates and reports.
    self.assertEqual(
        guess['ref_context'], groundTruth,
        "ref_context was not populated properly -- should be blank: " + str(guess['ref_context']))
def testEmptyAnswer(self):
    ''' The Reference Datasource should return a blank ref_context if the chromosome is not found.

    Note: A log entry should also be written, but this is not tested.
    '''
    self.logger.info("Please ignore the next logging warning: testdata/reference_ds/chrTHIS_DOES_NOT_EXIST.txt not found. Please add it.")
    ds = ReferenceDatasource('testdata/reference_ds')

    m = MutationData()
    m.chr = "THIS_DOES_NOT_EXIST"
    m.start = "11"
    m.end = "11"

    groundTruth = ""

    # The result of annotate_mutation is indexed directly by annotation name.
    guess = ds.annotate_mutation(m)

    # assertEqual (rather than assertTrue on an == expression) so that the
    # comparison itself is what the test framework evaluates and reports.
    self.assertEqual(
        guess['ref_context'], groundTruth,
        "ref_context was not populated properly -- should be blank: " + str(guess['ref_context']))
def testSimpleGLAnnotate(self):
    ''' Test a simple annotation case on a GL contig.

    Make sure that the ref_context and gc_content annotations are correct.
    '''
    ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)

    m = MutationDataFactory.default_create()
    m.chr = "GL000211.1"
    m.start = "11"
    m.end = "11"

    groundTruth = "gaattctttttcaagtaagtc"

    guess = ds.annotate_mutation(m)

    # Failure messages report the same object the assertions inspect (guess),
    # so building the message cannot fail on a key that only guess carries.
    self.assertEqual(
        guess['ref_context'], groundTruth,
        "ref_context was not populated properly: " + str(guess['ref_context']))

    # gc_content is rounded to 3 decimal places
    self.assertTrue(
        fabs(float(guess['gc_content']) - (float(3) / float(11))) < .001,
        "gc_content was not populated properly: " + str(guess['gc_content']))
def testSimpleGLAnnotate(self):
    ''' Test a simple annotation case on a GL contig.

    Make sure that the ref_context and gc_content annotations are correct.
    '''
    ds = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)

    m = MutationData()
    m.chr = "GL000211.1"
    m.start = "11"
    m.end = "11"

    groundTruth = "gaattctttttcaagtaagtc"

    guess = ds.annotate_mutation(m)

    # Failure messages report the same object the assertions inspect (guess),
    # so building the message cannot fail on a key that only guess carries.
    self.assertEqual(
        guess['ref_context'], groundTruth,
        "ref_context was not populated properly: " + str(guess['ref_context']))

    # gc_content is rounded to 3 decimal places
    self.assertTrue(
        fabs(float(guess['gc_content']) - (float(3) / float(11))) < .001,
        "gc_content was not populated properly: " + str(guess['gc_content']))
def testSimpleAnnotate(self):
    ''' Annotate a position on one of the aligned chromosomes (chr22) and make sure the answer is reasonable. '''
    datasource = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)

    mutation = MutationDataFactory.default_create()
    mutation.chr = "22"
    mutation.start = "11"
    mutation.end = "11"

    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    expected_gc = 6.0 / 11.0

    annotated = datasource.annotate_mutation(mutation)

    self.assertTrue(
        annotated['ref_context'] == expected_context,
        "ref_context was not populated properly: " + str(annotated['ref_context']))

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - expected_gc)
    self.assertTrue(
        gc_error < .001,
        "gc_content was not populated properly: " + str(annotated['gc_content']))
def testSimpleAnnotate(self):
    ''' Annotate a position on one of the aligned chromosomes (chr22) and make sure the answer is reasonable. '''
    datasource = ReferenceDatasource('testdata/reference_ds', windowSizeGCContent=5)

    mutation = MutationData()
    mutation.chr = "22"
    mutation.start = "11"
    mutation.end = "11"

    expected_context = "CCCAAGCTAAACCCAGGCCAC"
    expected_gc = 6.0 / 11.0

    annotated = datasource.annotate_mutation(mutation)

    self.assertTrue(
        annotated['ref_context'] == expected_context,
        "ref_context was not populated properly: " + str(annotated['ref_context']))

    # gc_content is rounded to 3 decimal places
    gc_error = fabs(float(annotated['gc_content']) - expected_gc)
    self.assertTrue(
        gc_error < .001,
        "gc_content was not populated properly: " + str(annotated['gc_content']))
def testExtentOutOfRangeError(self):
    ''' A window that extends beyond the beginning or end of a file is truncated.

    The truncated window is used for ref_context, and whatever is left is used for gc_content as well.
    '''
    reference = ReferenceDatasource('testdata/reference_ds', windowSizeRef=6, windowSizeGCContent=5)

    mut = MutationDataFactory.default_create()
    mut.chr = "22"
    mut.start = "4"
    mut.end = "4"

    # For comparison: "CCCAAGCTAAACCCAGGCCAC" -- the expected value is the 10-base prefix of this string.
    truncated_context = "CCCAAGCTAA"
    truncated_gc = 5.0 / 9.0

    result = reference.annotate_mutation(mut)

    self.assertTrue(
        result['ref_context'] == truncated_context,
        "ref_context was not populated properly: " + str(result['ref_context']))

    # gc_content is rounded to 3 decimal places
    deviation = fabs(float(result['gc_content']) - truncated_gc)
    self.assertTrue(
        deviation < .001,
        "gc_content was not populated properly: " + str(result['gc_content']))
def testFilenameDetermination(self):
    ''' Test that proper conversions are being done for chromosome to flat filename. '''
    ds = ReferenceDatasource('testdata/reference_ds')

    # (chromosome name, expected flat filename, failure-message prefix)
    expectations = [
        ("GL000211.1", 'chrUn_gl000211.txt', "Did not find GL file: "),
        ("X", 'chrX.txt', "Did not find chrX file: "),
        ("GL000209.1", 'chr19_gl000209_random.txt', "Did not find GL chr19 file: "),
    ]

    for chrom, expected_filename, failure_prefix in expectations:
        self.assertTrue(
            ds.convertMutationChrToFilename(chrom) == expected_filename,
            failure_prefix + str(ds.convertMutationChrToFilename(chrom)))
def testFilenameDetermination(self):
    ''' Test that proper conversions are being done for chromosome to flat filename. '''
    ds = ReferenceDatasource('testdata/reference_ds')

    # Each case: (chromosome name, expected flat filename, failure-message prefix)
    cases = (
        ("GL000211.1", 'chrUn_gl000211.txt', "Did not find GL file: "),
        ("X", 'chrX.txt', "Did not find chrX file: "),
        ("GL000209.1", 'chr19_gl000209_random.txt', "Did not find GL chr19 file: "),
    )

    for case in cases:
        chromosome, filename, prefix = case
        matches = ds.convertMutationChrToFilename(chromosome) == filename
        self.assertTrue(matches, prefix + str(ds.convertMutationChrToFilename(chromosome)))
def createReferenceDatasource(config):
    """Build a ReferenceDatasource from the "refDir" option of the "ref_hg" config section."""
    return ReferenceDatasource(config.get("ref_hg", "refDir"))
def createDatasourceFromConfigParser(configParser, leafDir):
    """
    Instantiate the datasource described by the config file of a leaf directory.

    configParser -- config parser instance from the config file in the leafdir.  For information on config file format/conventions see (TODO)
    leafDir -- contains the file and necessary files (post indexing and install steps) to instantiate a datasource.

    Returns the constructed datasource, after its hashcode has been set from
    DatasourceFactory._retrieve_hash_code(leafDir).

    Raises RuntimeError when the "type" option is not recognized, or when a
    bigwig datasource is requested while NGSLIB_INSTALLED is false.
    """

    def general(option):
        # Read a single option from the "general" section.
        return configParser.get("general", option)

    def in_leaf_dir(option):
        # Prefix the filename stored under a "general" option with the leaf directory.
        return filePrefix + general(option)

    # Determine the type
    dsType = general("type")

    # TODO: Replace these if statements with something a bit more robust, such as a proper dependency injection framework
    filePrefix = leafDir + "/"

    if dsType == "gaf":
        gaf_fname = in_leaf_dir('gaf_fname')
        gaf_transcript_sequences_fname = in_leaf_dir('gaf_transcript_seqs_fname')
        result = Gaf(
            gaf_fname,
            gaf_transcript_sequences_fname,
            title=general("title"),
            version=general("version"),
            protocol=general("protocol"))

    elif dsType == "dbsnp":
        result = dbSNP(
            in_leaf_dir('src_file'),
            title=general('title'),
            version=general('version'))

    elif dsType == "ensembl":
        result = EnsemblTranscriptDatasource(
            in_leaf_dir('src_file'),
            title=general('title'),
            version=general('version'),
            tx_filter=general('transcript_filter'))

    elif dsType == "cosmic":
        result = Cosmic(
            src_file=in_leaf_dir('src_file'),
            version=general('version'),
            gpp_tabix_file=in_leaf_dir('gpp_src_file'))

    elif dsType == 'ref':
        # The configured value is passed through as read; 10 is used when the option is absent.
        window_size = general('windowSizeRef') if configParser.has_option('general', 'windowSizeRef') else 10
        result = ReferenceDatasource(
            filePrefix,
            title=general("title"),
            version=general('version'),
            windowSizeRef=window_size)

    elif dsType == 'gene_tsv':
        result = GenericGeneDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            geneColumnName=general('gene_col'))

    elif dsType == 'transcript_tsv':
        result = GenericTranscriptDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            geneColumnName=general('transcript_col'))

    elif dsType == 'vc_tsv':
        result = GenericVariantClassificationDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            geneColumnName=general('vc_col'))

    elif dsType == 'gp_tsv':
        result = GenericGenomicPositionDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            gpColumnNames=general('genomic_position_cols'))

    elif dsType == 'gm_tsv':
        result = GenericGenomicMutationDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            gpColumnNames=general('genomic_position_cols'))

    elif dsType == 'gm_tsv_reverse_complement':
        result = GenericGenomicMutationDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            gpColumnNames=general('genomic_position_cols'),
            use_complementary_strand_alleles_for_negative_strand_transcripts=True)

    elif dsType == 'gpp_tsv':
        result = GenericGeneProteinPositionDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            gpColumnNames=general('gene_protein_position_cols'))

    elif dsType == "transcript_to_uniprot_aa":
        result = TranscriptToUniProtProteinPositionTransformingDatasource(
            title=general("title"),
            version=general('version'),
            src_file="file://" + in_leaf_dir('src_file'),  # three slashes for sqlite
            inputPositionAnnotationName=general('inputPositionAnnotationName'),
            outputPositionAnnotationName=general('outputPositionAnnotationName'))

    elif dsType == "mock_exception":
        result = MockExceptionThrowingDatasource(
            title=general("title"),
            version=general('version'))

    elif dsType == "indexed_vcf":
        result = IndexedVcfDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'),
            match_mode=general('match_mode'))

    elif dsType == "indexed_tsv":
        # The three column lists are stored as comma-separated strings.
        columnNames = general("column_names").split(",")
        annotationColumnNames = general("annotation_column_names").split(",")
        indexColumnNames = general("index_column_names").split(",")

        DatasourceFactory._log_missing_column_name_msg(columnNames, annotationColumnNames)

        # Look up a data type for every annotation column whose name is not blank.
        columnDataTypes = dict()
        for annotationColumn in annotationColumnNames:
            if annotationColumn.strip() != "":
                columnDataTypes[annotationColumn] = configParser.get("data_types", annotationColumn)

        result = IndexedTsvDatasource(
            src_file=in_leaf_dir("src_file"),
            title=general("title"),
            version=general("version"),
            colNames=columnNames,
            annotationColNames=annotationColumnNames,
            indexColNames=indexColumnNames,
            match_mode=general("match_mode"),
            colDataTypes=columnDataTypes)

    elif dsType == 'bigwig':
        if not NGSLIB_INSTALLED:
            raise RuntimeError("Bigwig datasource found in db-dir but ngslib library not installed.")
        result = BigWigDatasource(
            src_file=in_leaf_dir('src_file'),
            title=general("title"),
            version=general('version'))

    else:
        raise RuntimeError('Unknown datasource type: %s' % dsType)

    result.set_hashcode(DatasourceFactory._retrieve_hash_code(leafDir))
    return result