def testRightVersion(self):
    """
    Initialises a repository in write mode, reopens the same path in
    read mode and checks that the schema version seen by the reader
    equals the string form of the writer's version.
    """
    writer = datarepo.SqlDataRepository(self._repoPath)
    writer.open(datarepo.MODE_WRITE)
    writer.initialise()
    reader = datarepo.SqlDataRepository(self._repoPath)
    reader.open(datarepo.MODE_READ)
    # assertEqual is used instead of the deprecated assertEquals alias,
    # which no longer exists in unittest from Python 3.12 onwards.
    self.assertEqual(reader._schemaVersion, str(writer.version))
def testWrongVersion(self):
    """
    Initialises a repository whose version was overridden with
    "wrong.version" and checks that opening the same path for reading
    raises RepoSchemaVersionMismatchException.
    """
    writer = datarepo.SqlDataRepository(self._repoPath)
    bogusVersion = datarepo.SqlDataRepository.SchemaVersion("wrong.version")
    writer.version = bogusVersion
    writer.open(datarepo.MODE_WRITE)
    writer.initialise()

    reader = datarepo.SqlDataRepository(self._repoPath)
    expectedError = exceptions.RepoSchemaVersionMismatchException
    with self.assertRaises(expectedError):
        reader.open(datarepo.MODE_READ)
def setUp(self):
    """
    Opens the test data repository read-only, wraps it in a Backend and
    records the first dataset, its id and an access map keyed by the
    dataset's local id.
    """
    repo = datarepo.SqlDataRepository(paths.testDataRepo)
    repo.open(datarepo.MODE_READ)
    self.backend = backend.Backend(repo)
    allDatasets = self.backend.getDataRepository().getDatasets()
    self.dataset = allDatasets[0]
    self.dataset_id = self.dataset.getId()
    localId = self.dataset.getLocalId()
    self.access_map = {localId: 4}
def setUp(self):
    """
    Records the module test server URL and builds a LocalClient on top
    of a Backend that reads from the test data repository.
    """
    self._maxDiff = None
    self._dataUrl = moduleTestServer.getUrl()
    repo = datarepo.SqlDataRepository(paths.testDataRepo)
    repo.open(datarepo.MODE_READ)
    self._backend = backend.Backend(repo)
    self._client = client.LocalClient(self._backend)
def __init__(self, rnaQuantificationLocalId, baseDir):
    """
    :param rnaQuantificationLocalId: name ending in '.db'; the last three
        characters are dropped before it is handed to the parent class
    :param baseDir: passed through unchanged to the parent constructor
    """
    self._dataset = datasets.Dataset(_datasetName)
    self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
    self._repo.open(datarepo.MODE_READ)
    self._referenceSet = references.AbstractReferenceSet("test_rs")
    # Strip the trailing '.db' extension.
    strippedId = rnaQuantificationLocalId[:-3]
    parent = super(RnaQuantificationTest, self)
    parent.__init__(strippedId, baseDir)
def _configure_backend(app):
    """
    Builds and returns the Backend described by the application config.

    The data repository is selected by the scheme of the DATA_SOURCE URL:
    file:// (or no scheme) opens a SqlDataRepository read-only, empty://
    gives an EmptyDataRepository and simulated:// gives a
    SimulatedDataRepository configured from the SIMULATED_BACKEND_*
    settings. Any other scheme raises ConfigurationException.
    """
    config = app.config
    dataSource = urlparse.urlparse(config["DATA_SOURCE"], "file")
    scheme = dataSource.scheme

    if scheme == "file":
        path = os.path.join(dataSource.netloc, dataSource.path)
        dataRepository = datarepo.SqlDataRepository(path)
        dataRepository.open(datarepo.MODE_READ)
    elif scheme == "empty":
        dataRepository = datarepo.EmptyDataRepository()
    elif scheme == "simulated":
        # The query string of the URL is ignored. The settings are read in
        # a fixed order before the repository is constructed.
        dataRepository = datarepo.SimulatedDataRepository(
            randomSeed=config["SIMULATED_BACKEND_RANDOM_SEED"],
            numCalls=config["SIMULATED_BACKEND_NUM_CALLS"],
            variantDensity=config["SIMULATED_BACKEND_VARIANT_DENSITY"],
            numVariantSets=config["SIMULATED_BACKEND_NUM_VARIANT_SETS"],
            numReferenceSets=config["SIMULATED_BACKEND_NUM_REFERENCE_SETS"],
            numReferencesPerReferenceSet=config[
                "SIMULATED_BACKEND_NUM_REFERENCES_PER_REFERENCE_SET"],
            numAlignments=config[
                "SIMULATED_BACKEND_NUM_ALIGNMENTS_PER_READ_GROUP"],
            numReadGroupsPerReadGroupSet=config[
                "SIMULATED_BACKEND_NUM_READ_GROUPS_PER_READ_GROUP_SET"],
            numPhenotypeAssociations=config[
                "SIMULATED_BACKEND_NUM_PHENOTYPE_ASSOCIATIONS"],
            numPhenotypeAssociationSets=config[
                "SIMULATED_BACKEND_NUM_PHENOTYPE_ASSOCIATION_SETS"],
            numRnaQuantSets=config[
                "SIMULATED_BACKEND_NUM_RNA_QUANTIFICATION_SETS"],
            numExpressionLevels=config[
                "SIMULATED_BACKEND_NUM_EXPRESSION_LEVELS_PER_RNA_QUANT_SET"])
    else:
        raise exceptions.ConfigurationException(
            "Unsupported data source scheme: " + scheme)

    theBackend = backend.Backend(dataRepository)
    theBackend.setRequestValidation(config["REQUEST_VALIDATION"])
    theBackend.setDefaultPageSize(config["DEFAULT_PAGE_SIZE"])
    theBackend.setMaxResponseLength(config["MAX_RESPONSE_LENGTH"])
    return theBackend
def _createContinuousSet(self):
    """
    Opens the test data repository read-only, creates the "testDs"
    dataset and stores the result of continuous.readSet for the
    "testContinuous" set name.
    """
    setName = "testContinuous"
    self._continuousSetName = setName
    self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
    self._repo.open(datarepo.MODE_READ)
    self._dataset = datasets.Dataset("testDs")
    self._continuousSet = continuous.readSet(self._dataset, setName)
def __init__(self, featureSetLocalName, dataPath):
    """
    :param featureSetLocalName: Name of the GFF3 resource corresponding
        to a pair of files, .db and .gff3; the trailing '.db' is removed
        before use
    :param dataPath: string representing full path to the .db file
    :return:
    """
    self._dataset = datasets.Dataset(_datasetName)
    self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
    self._repo.open(datarepo.MODE_READ)
    self._ontology = self._repo.getOntologyByName(paths.ontologyName)
    self._referenceSet = references.AbstractReferenceSet("test_rs")
    # Drop the three-character '.db' suffix to obtain the lookup key.
    localName = featureSetLocalName[:-3]
    self._testData = _testDataForFeatureSetName[localName]
    parent = super(FeatureSetTests, self)
    parent.__init__(localName, dataPath)
def _createVariantAnnotationSet(self, vcfDir):
    """
    Builds an HtslibVariantSet populated from the given directory of VCF
    files, then an HtslibVariantAnnotationSet on top of it whose ontology
    is looked up by name in the test data repository.
    """
    variantSetName = "testVariantSet"
    self._variantSetName = variantSetName
    self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
    self._repo.open(datarepo.MODE_READ)
    self._dataset = datasets.Dataset("testDs")

    variantSet = variants.HtslibVariantSet(self._dataset, variantSetName)
    self._variantSet = variantSet
    variantSet.populateFromDirectory(vcfDir)

    annotationSet = variants.HtslibVariantAnnotationSet(
        variantSet, "testVAs")
    self._variantAnnotationSet = annotationSet
    ontology = self._repo.getOntologyByName(paths.ontologyName)
    annotationSet.setOntology(ontology)
def __init__(self, inputDirectory, outputDirectory, force):
    """
    Converts human readable dataset from compliance repository, and
    translates it into a reference-server readable filesystem with
    binary files.

    An existing output directory is removed when force is true;
    otherwise messages are logged and exit(1) is called. When no input
    directory is given the compliance data is downloaded from GitHub
    into a temporary directory, which then serves as the input.

    :param inputDirectory: location of the human readable compliance
        dataset, or None to download it
    :param outputDirectory: location of the file hierarchy suitable for
        deploying on the reference server
    :param force: whether an existing output directory may be removed
    """
    self.inputDirectory = inputDirectory
    self.outputDirectory = outputDirectory
    registryPath = os.path.join(outputDirectory, "registry.db")
    self.repoPath = os.path.abspath(registryPath)
    self.tempdir = None

    if os.path.exists(self.outputDirectory):
        if not force:
            existsMessage = "Output directory '{}' already exists".format(
                self.outputDirectory)
            utils.log(existsMessage)
            utils.log("Please specify an output path that does not exist")
            utils.log("Exiting...")
            exit(1)
        else:
            removeMessage = (
                "Removing existing output directory at '{}'".format(
                    self.outputDirectory))
            utils.log(removeMessage)
            shutil.rmtree(self.outputDirectory)

    # If no input directory is specified download from GitHub
    if inputDirectory is None:
        utils.log("Downloading test data...")
        self.tempdir = tempfile.mkdtemp()
        assert(os.path.exists(self.tempdir))
        url = "https://github.com/ga4gh/compliance/archive/master.zip"
        archivePath = os.path.join(self.tempdir, 'compliance-master.zip')
        file_downloader.HttpFileDownloader(url, archivePath).download()
        utils.log("Extracting test data...")
        with zipfile.ZipFile(archivePath, "r") as archive:
            archive.extractall(self.tempdir)
        self.inputDirectory = os.path.join(
            self.tempdir, 'compliance-master', 'test-data')

    self.repo = datarepo.SqlDataRepository(self.repoPath)
def createRepo(self):
    """
    Creates the repository for all the data we've just downloaded.

    Inserts one reference set, one dataset, one variant set and one read
    group set per sample, commits and closes the repository, then
    reopens it read-only to print a summary.
    """
    registry = datarepo.SqlDataRepository(self.repoPath)
    registry.open("w")
    registry.initialise()

    # Reference set and its references share the same species record.
    speciesJson = (
        '{"id": "9606",' +
        '"term": "H**o sapiens", "source_name": "NCBI"}')
    refSet = references.HtslibReferenceSet("GRCh37-subset")
    refSet.populateFromFile(self.fastaFilePath)
    refSet.setDescription("Subset of GRCh37 used for demonstration")
    refSet.setSpeciesFromJson(speciesJson)
    for ref in refSet.getReferences():
        ref.setSpeciesFromJson(speciesJson)
        accession = self.accessions[ref.getName()] + ".subset"
        ref.setSourceAccessions(accession)
    registry.insertReferenceSet(refSet)

    dataset = datasets.Dataset("1kg-p3-subset")
    dataset.setDescription("Sample data from 1000 Genomes phase 3")
    registry.insertDataset(dataset)

    # self.vcfFilePaths holds (vcf file, index file) pairs.
    variantSet = variants.HtslibVariantSet(dataset, "mvncall")
    variantSet.setReferenceSet(refSet)
    vcfFiles = [pair[0] for pair in self.vcfFilePaths]
    vcfIndexes = [pair[1] for pair in self.vcfFilePaths]
    variantSet.populateFromFile(vcfFiles, vcfIndexes)
    variantSet.checkConsistency()
    registry.insertVariantSet(variantSet)

    # self.bamFilePaths holds (bam file, index file) pairs, one per sample.
    for sample, bamPair in zip(self.samples, self.bamFilePaths):
        bamFile, bamIndex = bamPair
        readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
        readGroupSet.populateFromFile(bamFile, bamIndex)
        readGroupSet.setReferenceSet(refSet)
        registry.insertReadGroupSet(readGroupSet)

    registry.commit()
    registry.close()
    self.log("Finished creating the repository; summary:\n")
    registry.open("r")
    registry.printSummary()
def testTextFile(self):
    """
    Overwrites the repository path with plain text and checks that
    opening it for reading raises RepoInvalidDatabaseException.
    """
    with open(self._repoPath, 'w') as handle:
        handle.write('This is now a text file')
    textRepo = datarepo.SqlDataRepository(self._repoPath)
    expectedError = exceptions.RepoInvalidDatabaseException
    with self.assertRaises(expectedError):
        textRepo.open(datarepo.MODE_READ)
def testDbFileWithoutTables(self):
    """
    Checks that opening the repository at self._repoPath for reading
    raises RepoInvalidDatabaseException.
    """
    uninitialisedRepo = datarepo.SqlDataRepository(self._repoPath)
    expectedError = exceptions.RepoInvalidDatabaseException
    with self.assertRaises(expectedError):
        uninitialisedRepo.open(datarepo.MODE_READ)
def main():
    """
    Builds the "repo2.db" repository for the Simons dataset.

    Any existing repo2.db is deleted first. The script then inserts, in
    order: the dataset, the biosamples and individuals parsed from the
    two TSV files, the hs37d5 reference set, the sequence ontology, and
    for every *.vcf.gz file in the VCF directory two variant sets (a
    plain one whose call sets are linked to matching biosamples, and an
    "-annotated-nh2" one whose variant annotation sets are inserted with
    the ontology attached). Everything is committed at the end.
    """
    # Set for using hg38 rather than hg19
    # reference_set_path = '/mnt/ga4gh/repo_data/hg38.fa.gz'
    reference_set_path = '/mnt/ga4gh/repo_data/hs37d5.fa.gz'

    bio_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.Biosample.tsv'
    ind_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.individual.tsv'
    bio_samples = parse_file_biosamples(bio_tsv_location)
    individuals = parse_file_individuals(ind_tsv_location)

    repoPath = "repo2.db"
    repo = datarepo.SqlDataRepository(repoPath)
    # Remove a stale database with os.remove rather than shelling out to
    # `rm`: it is portable and raises if the file cannot be deleted
    # instead of silently carrying on with the old contents.
    if os.path.isfile(repoPath):
        os.remove(repoPath)
    repo.open("w")
    repo.initialise()

    dataset = datasets.Dataset("Simons")
    dataset.setDescription(
        "Variants from the Simons Foundation Genome Diversity Project")
    repo.insertDataset(dataset)

    print("Inserting biosamples")
    new_bio_samples = []
    for bio_sample in bio_samples:
        new_bio_sample = biodata.Biosample(
            dataset, unicode(bio_sample['name'], errors='replace'))
        new_bio_sample.populateFromJson(json.dumps(bio_sample))
        repo.insertBiosample(new_bio_sample)
        new_bio_samples.append(new_bio_sample)

    print("Inserting individuals")
    new_individuals = []
    for individual in individuals:
        new_individual = biodata.Individual(
            dataset, unicode(individual['name'], errors='replace'))
        new_individual.populateFromJson(json.dumps(individual))
        repo.insertIndividual(new_individual)
        new_individuals.append(new_individual)

    print("Adding reference set (takes a while)")
    reference_set = references.HtslibReferenceSet("NCBI37")
    reference_set.populateFromFile(reference_set_path)
    reference_set.setDescription("NCBI37 assembly of the human genome")
    reference_set.setNcbiTaxonId(9606)
    reference_set.setSourceUri(
        "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
    )
    for reference in reference_set.getReferences():
        reference.setNcbiTaxonId(9606)
    repo.insertReferenceSet(reference_set)

    seq_ontology = ontologies.Ontology("/mnt/ga4gh/repo_data/so-xp")
    ontology_file_path = '/mnt/ga4gh/repo_data/so-xp-simple.obo'
    seq_ontology.populateFromFile(ontology_file_path)
    seq_ontology._id = "so-xp"
    repo.insertOntology(seq_ontology)
    repo.addOntology(seq_ontology)

    vcf_directory = os.path.dirname('/mnt/ga4gh/data/vcf/')
    pattern = os.path.join(vcf_directory, "*.vcf.gz")
    for vcfFile in glob.glob(pattern):
        # Derive the variant set name from the file name by stripping the
        # directory prefix and the fixed file suffix.
        name = vcfFile.replace("/mnt/ga4gh/data/vcf/", "")
        name = name.replace(".annotated.nh2.variants.vcf.gz", "")
        print(name)
        variant_set = variants.HtslibVariantSet(dataset, name)
        variant_set.setReferenceSet(reference_set)
        variant_set.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set.checkConsistency()
        # Link each call set to the biosample with the same local id.
        for call_set in variant_set.getCallSets():
            for bio_sample in new_bio_samples:
                if bio_sample.getLocalId() == call_set.getLocalId():
                    call_set.setBioSampleId(bio_sample.getId())
        repo.insertVariantSet(variant_set)

        # The same file is loaded a second time under a distinct name so
        # that its variant annotation sets can be registered.
        name = name + "-annotated-nh2"
        print(name)
        variant_set2 = variants.HtslibVariantSet(dataset, name)
        variant_set2.setReferenceSet(reference_set)
        variant_set2.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set2.checkConsistency()
        repo.insertVariantSet(variant_set2)
        for annotation_set in variant_set2.getVariantAnnotationSets():
            print(str(annotation_set) + "found")
            annotation_set.setOntology(seq_ontology)
            repo.insertVariantAnnotationSet(annotation_set)

    repo.commit()
    print("database filled!")
def setUp(self):
    """
    Opens the test data repository read-only and builds the backend and
    local client used by the tests.
    """
    testRepo = datarepo.SqlDataRepository(paths.testDataRepo)
    self._repo = testRepo
    testRepo.open(datarepo.MODE_READ)
    localBackend = backend.Backend(testRepo)
    self._backend = localBackend
    self._client = client.LocalClient(localBackend)
def setUp(self):
    """
    Opens 'registry.db' read-only and builds the backend and local
    client on top of it.
    """
    repo = datarepo.SqlDataRepository('registry.db')
    repo.open(datarepo.MODE_READ)
    localBackend = backend.Backend(repo)
    self._backend = localBackend
    self._client = client.LocalClient(localBackend)
def __init__(self, registryDb):
    """
    Opens the repository at registryDb read-only, initialises the parent
    backend with it and attaches a cProfile profiler.
    """
    dataRepository = datarepo.SqlDataRepository(registryDb)
    dataRepository.open(datarepo.MODE_READ)
    parent = super(CpuProfilerBackend, self)
    parent.__init__(dataRepository)
    self.profiler = cProfile.Profile()
def __init__(self, registryDb):
    """
    Opens the repository at registryDb read-only, initialises the parent
    backend with it and attaches a guppy heap profiler.
    """
    dataRepository = datarepo.SqlDataRepository(registryDb)
    dataRepository.open(datarepo.MODE_READ)
    parent = super(HeapProfilerBackend, self)
    parent.__init__(dataRepository)
    self.profiler = guppy.hpy()
def testDirectory(self):
    """
    Checks that opening a path produced by makeTempDir() for reading
    raises RepoInvalidDatabaseException.
    """
    directoryPath = makeTempDir()
    directoryRepo = datarepo.SqlDataRepository(directoryPath)
    expectedError = exceptions.RepoInvalidDatabaseException
    with self.assertRaises(expectedError):
        directoryRepo.open(datarepo.MODE_READ)
def __init__(self, args):
    """
    Stores the parsed arguments and the registry path, and creates (but
    does not open) the repository object for that path.

    :param args: object exposing a registryPath attribute
    """
    registryPath = args.registryPath
    self._args = args
    self._registryPath = registryPath
    self._repo = datarepo.SqlDataRepository(registryPath)
def setUp(self):
    """
    Opens the test data repository read-only for use by the tests.
    """
    testDataPath = paths.testDataRepo
    self._dataRepo = datarepo.SqlDataRepository(testDataPath)
    self._dataRepo.open(datarepo.MODE_READ)
def readRepo(self):
    """
    Returns a new repository object for self._repoPath, opened in read
    mode.
    """
    readOnlyRepo = datarepo.SqlDataRepository(self._repoPath)
    readOnlyRepo.open(datarepo.MODE_READ)
    return readOnlyRepo
def testNonexistantFile(self):
    """
    Checks that opening a path that does not exist for reading raises
    RepoNotFoundException.
    """
    missingRepo = datarepo.SqlDataRepository("aFilePathThatDoesNotExist")
    expectedError = exceptions.RepoNotFoundException
    with self.assertRaises(expectedError):
        missingRepo.open(datarepo.MODE_READ)
""") args = parser.parse_args() registryDb = "ga4gh-example-data/registry.db" if args.profile == 'heap': backendClass = HeapProfilerBackend backend = backendClass(registryDb) args.repeatLimit = 1 args.pageLimit = 1 elif args.profile == 'cpu': backendClass = CpuProfilerBackend backend = backendClass(registryDb) else: repo = datarepo.SqlDataRepository(registryDb) repo.open(datarepo.MODE_READ) backend = backend.Backend(repo) # Get our list of callSetids callSetIds = args.callSetIds if callSetIds != []: callSetIds = None if args.callSetIds != "*": callSetIds = args.callSetIds.split(",") minTime = benchmarkOneQuery(_heavyQuery(args.variantSetId, callSetIds), args.repeatLimit, args.pageLimit) print(minTime) if args.profile == 'cpu': stats = pstats.Stats(backend.profiler)