Exemplo n.º 1
0
 def testRightVersion(self):
     """
     Initialise a repository at ``self._repoPath``, reopen it read-only
     through a second instance, and check that the reader's
     ``_schemaVersion`` equals ``str()`` of the writer's ``version``.
     """
     writer = datarepo.SqlDataRepository(self._repoPath)
     writer.open(datarepo.MODE_WRITE)
     writer.initialise()
     reader = datarepo.SqlDataRepository(self._repoPath)
     reader.open(datarepo.MODE_READ)
     # assertEqual replaces the deprecated assertEquals alias, which was
     # removed from unittest in Python 3.12.
     self.assertEqual(reader._schemaVersion, str(writer.version))
Exemplo n.º 2
0
 def testWrongVersion(self):
     """
     Initialise a repository whose ``version`` was replaced by the
     schema version "wrong.version", then check that opening it for
     reading through a second instance raises
     RepoSchemaVersionMismatchException.
     """
     writer = datarepo.SqlDataRepository(self._repoPath)
     bogusVersion = datarepo.SqlDataRepository.SchemaVersion(
         "wrong.version")
     writer.version = bogusVersion
     writer.open(datarepo.MODE_WRITE)
     writer.initialise()
     reader = datarepo.SqlDataRepository(self._repoPath)
     expectedError = exceptions.RepoSchemaVersionMismatchException
     with self.assertRaises(expectedError):
         reader.open(datarepo.MODE_READ)
Exemplo n.º 3
0
 def setUp(self):
     """
     Open the test data repository read-only, wrap it in a Backend and
     cache the first dataset, its id and an access map for it.
     """
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     repo.open(datarepo.MODE_READ)
     self.backend = backend.Backend(repo)
     allDatasets = self.backend.getDataRepository().getDatasets()
     self.dataset = allDatasets[0]
     self.dataset_id = self.dataset.getId()
     # Maps the dataset's local id to the value 4.
     # NOTE(review): 4 is presumably an access level; confirm with callers.
     localId = self.dataset.getLocalId()
     self.access_map = {localId: 4}
Exemplo n.º 4
0
 def setUp(self):
     """
     Record the module test server URL and build a LocalClient on top of
     a Backend that reads the test data repository.
     """
     # NOTE(review): unittest's diff-length attribute is ``maxDiff``;
     # confirm that ``_maxDiff`` is what is intended here.
     self._maxDiff = None
     self._dataUrl = moduleTestServer.getUrl()
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     repo.open(datarepo.MODE_READ)
     localBackend = backend.Backend(repo)
     self._backend = localBackend
     self._client = client.LocalClient(localBackend)
Exemplo n.º 5
0
 def __init__(self, rnaQuantificationLocalId, baseDir):
     """
     Prepare the dataset, read-only test repository and reference set,
     then initialise the base class with the quantification id.

     :param rnaQuantificationLocalId: file name ending in '.db'; its
         last three characters are stripped to form the id
     :param baseDir: passed through unchanged to the base class
     """
     self._dataset = datasets.Dataset(_datasetName)
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo = repo
     repo.open(datarepo.MODE_READ)
     self._referenceSet = references.AbstractReferenceSet("test_rs")
     # Drop the trailing three characters (the '.db' extension).
     extensionLength = 3
     quantificationId = rnaQuantificationLocalId[:-extensionLength]
     super(RnaQuantificationTest, self).__init__(quantificationId, baseDir)
Exemplo n.º 6
0
def _configure_backend(app):
    """
    Build the Backend described by ``app.config`` and return it.

    The repository type is chosen from the scheme of the DATA_SOURCE URL
    (``file`` is assumed when the URL carries no scheme):

    - ``simulated://`` creates a SimulatedDataRepository configured from
      the SIMULATED_BACKEND_* settings (the query string is ignored);
    - ``empty://`` creates an EmptyDataRepository;
    - ``file://`` opens a SqlDataRepository read-only at the URL's path.

    Any other scheme raises ConfigurationException. The REQUEST_VALIDATION,
    DEFAULT_PAGE_SIZE and MAX_RESPONSE_LENGTH settings are then applied to
    the new backend.
    """
    config = app.config
    dataSource = urlparse.urlparse(config["DATA_SOURCE"], "file")
    scheme = dataSource.scheme

    if scheme == "simulated":
        # (constructor keyword, config key suffix), listed in the order
        # the settings are read from the configuration.
        simulatedSettings = (
            ("randomSeed", "RANDOM_SEED"),
            ("numCalls", "NUM_CALLS"),
            ("variantDensity", "VARIANT_DENSITY"),
            ("numVariantSets", "NUM_VARIANT_SETS"),
            ("numReferenceSets", "NUM_REFERENCE_SETS"),
            ("numReferencesPerReferenceSet",
             "NUM_REFERENCES_PER_REFERENCE_SET"),
            ("numAlignments", "NUM_ALIGNMENTS_PER_READ_GROUP"),
            ("numReadGroupsPerReadGroupSet",
             "NUM_READ_GROUPS_PER_READ_GROUP_SET"),
            ("numPhenotypeAssociations", "NUM_PHENOTYPE_ASSOCIATIONS"),
            ("numPhenotypeAssociationSets",
             "NUM_PHENOTYPE_ASSOCIATION_SETS"),
            ("numRnaQuantSets", "NUM_RNA_QUANTIFICATION_SETS"),
            ("numExpressionLevels",
             "NUM_EXPRESSION_LEVELS_PER_RNA_QUANT_SET"),
        )
        simulatedKwargs = {}
        for keyword, suffix in simulatedSettings:
            simulatedKwargs[keyword] = config["SIMULATED_BACKEND_" + suffix]
        dataRepository = datarepo.SimulatedDataRepository(**simulatedKwargs)
    elif scheme == "empty":
        dataRepository = datarepo.EmptyDataRepository()
    elif scheme == "file":
        repoPath = os.path.join(dataSource.netloc, dataSource.path)
        dataRepository = datarepo.SqlDataRepository(repoPath)
        dataRepository.open(datarepo.MODE_READ)
    else:
        raise exceptions.ConfigurationException(
            "Unsupported data source scheme: " + dataSource.scheme)

    configuredBackend = backend.Backend(dataRepository)
    configuredBackend.setRequestValidation(config["REQUEST_VALIDATION"])
    configuredBackend.setDefaultPageSize(config["DEFAULT_PAGE_SIZE"])
    configuredBackend.setMaxResponseLength(config["MAX_RESPONSE_LENGTH"])
    return configuredBackend
Exemplo n.º 7
0
 def _createContinuousSet(self):
     """
     Open the test data repository read-only and create the
     "testContinuous" continuous set inside a fresh "testDs" dataset.
     """
     setName = "testContinuous"
     self._continuousSetName = setName
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo = repo
     repo.open(datarepo.MODE_READ)
     dataset = datasets.Dataset("testDs")
     self._dataset = dataset
     self._continuousSet = continuous.readSet(dataset, setName)
Exemplo n.º 8
0
 def __init__(self, featureSetLocalName, dataPath):
     """
     Prepare the dataset, read-only test repository, ontology and
     reference set, look up the expected test data, then initialise the
     base class.

     :param featureSetLocalName: file name of the GFF3 resource ending
         in '.db'; the last three characters are stripped before use
     :param dataPath: string representing full path to the .db file
     """
     self._dataset = datasets.Dataset(_datasetName)
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo = repo
     repo.open(datarepo.MODE_READ)
     self._ontology = repo.getOntologyByName(paths.ontologyName)
     self._referenceSet = references.AbstractReferenceSet("test_rs")
     # Drop the trailing three characters (the '.db' extension).
     strippedName = featureSetLocalName[:-3]
     self._testData = _testDataForFeatureSetName[strippedName]
     super(FeatureSetTests, self).__init__(strippedName, dataPath)
 def _createVariantAnnotationSet(self, vcfDir):
     """
     Build the "testVAs" variant annotation set on top of a
     "testVariantSet" variant set populated from ``vcfDir``, and attach
     the ontology named ``paths.ontologyName`` from the test repository.

     :param vcfDir: directory handed to ``populateFromDirectory``
     """
     variantSetName = "testVariantSet"
     self._variantSetName = variantSetName
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo = repo
     repo.open(datarepo.MODE_READ)
     dataset = datasets.Dataset("testDs")
     self._dataset = dataset
     variantSet = variants.HtslibVariantSet(dataset, variantSetName)
     self._variantSet = variantSet
     variantSet.populateFromDirectory(vcfDir)
     annotationSet = variants.HtslibVariantAnnotationSet(
         variantSet, "testVAs")
     self._variantAnnotationSet = annotationSet
     ontology = repo.getOntologyByName(paths.ontologyName)
     annotationSet.setOntology(ontology)
Exemplo n.º 10
0
    def __init__(self, inputDirectory, outputDirectory, force):
        """
        Converts human readable dataset from compliance repository,
        and translates it into a reference-server readable filesystem
        with binary files.

        :param inputDirectory: location of
            the human readable compliance dataset; when None the data is
            downloaded from GitHub into a temporary directory
        :param outputDirectory: location of
            the file hierarchy suitable for deploying on the reference server
        :param force: when true an existing output directory is removed;
            otherwise an existing output directory is reported and the
            process exits with status 1
        """
        # Imported locally so the block does not depend on the file's
        # import header, which is not visible here.
        import sys

        self.inputDirectory = inputDirectory
        self.outputDirectory = outputDirectory
        self.repoPath = os.path.abspath(
            os.path.join(outputDirectory, "registry.db"))
        self.tempdir = None

        if os.path.exists(self.outputDirectory):
            if force:
                utils.log(
                    "Removing existing output directory at '{}'".format(
                        self.outputDirectory))
                shutil.rmtree(self.outputDirectory)
            else:
                utils.log(
                    "Output directory '{}' already exists".format(
                        self.outputDirectory))
                utils.log(
                    "Please specify an output path that does not exist")
                utils.log("Exiting...")
                # sys.exit rather than the builtin exit(): the latter is
                # injected by the site module for interactive use and is
                # missing under "python -S" or in frozen applications.
                sys.exit(1)

        # If no input directory is specified download from GitHub
        if inputDirectory is None:
            utils.log("Downloading test data...")
            self.tempdir = tempfile.mkdtemp()
            assert os.path.exists(self.tempdir)
            url = "https://github.com/ga4gh/compliance/archive/master.zip"
            filePath = os.path.join(self.tempdir, 'compliance-master.zip')
            downloader = file_downloader.HttpFileDownloader(url, filePath)
            downloader.download()
            utils.log("Extracting test data...")
            with zipfile.ZipFile(filePath, "r") as z:
                z.extractall(self.tempdir)
            # The archive unpacks into a 'compliance-master' directory.
            self.inputDirectory = os.path.join(
                self.tempdir, 'compliance-master', 'test-data')
        self.repo = datarepo.SqlDataRepository(self.repoPath)
    def createRepo(self):
        """
        Creates the repository for all the data we've just downloaded.

        A new SqlDataRepository is initialised at ``self.repoPath`` and
        filled with the "GRCh37-subset" reference set (from
        ``self.fastaFilePath``), the "1kg-p3-subset" dataset, the "mvncall"
        variant set (from ``self.vcfFilePaths``) and one read group set per
        entry of ``self.samples`` / ``self.bamFilePaths``. After committing,
        the repository is reopened for reading and its summary is printed.
        """
        # The species term had been corrupted to "H**o sapiens"; NCBI taxon
        # 9606 is "Homo sapiens". The same JSON is used for the reference
        # set and for every reference in it.
        speciesJson = (
            '{"id": "9606",' +
            '"term": "Homo sapiens", "source_name": "NCBI"}')

        repo = datarepo.SqlDataRepository(self.repoPath)
        repo.open("w")
        repo.initialise()

        referenceSet = references.HtslibReferenceSet("GRCh37-subset")
        referenceSet.populateFromFile(self.fastaFilePath)
        referenceSet.setDescription("Subset of GRCh37 used for demonstration")
        referenceSet.setSpeciesFromJson(speciesJson)
        for reference in referenceSet.getReferences():
            reference.setSpeciesFromJson(speciesJson)
            reference.setSourceAccessions(
                self.accessions[reference.getName()] + ".subset")
        repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("1kg-p3-subset")
        dataset.setDescription("Sample data from 1000 Genomes phase 3")
        repo.insertDataset(dataset)

        variantSet = variants.HtslibVariantSet(dataset, "mvncall")
        variantSet.setReferenceSet(referenceSet)
        # self.vcfFilePaths holds (vcfFile, indexFile) pairs.
        dataUrls = [vcfFile for vcfFile, _ in self.vcfFilePaths]
        indexFiles = [indexFile for _, indexFile in self.vcfFilePaths]
        variantSet.populateFromFile(dataUrls, indexFiles)
        variantSet.checkConsistency()
        repo.insertVariantSet(variantSet)

        # self.bamFilePaths holds (bamFile, indexFile) pairs, paired
        # positionally with self.samples.
        for sample, (bamFile, indexFile) in zip(self.samples,
                                                self.bamFilePaths):
            readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
            readGroupSet.populateFromFile(bamFile, indexFile)
            readGroupSet.setReferenceSet(referenceSet)
            repo.insertReadGroupSet(readGroupSet)

        repo.commit()
        repo.close()
        self.log("Finished creating the repository; summary:\n")
        repo.open("r")
        repo.printSummary()
Exemplo n.º 12
0
 def testTextFile(self):
     """
     Overwrite the repository path with plain text and check that opening
     it for reading raises RepoInvalidDatabaseException.
     """
     with open(self._repoPath, 'w') as handle:
         handle.write('This is now a text file')
     textRepo = datarepo.SqlDataRepository(self._repoPath)
     expectedError = exceptions.RepoInvalidDatabaseException
     with self.assertRaises(expectedError):
         textRepo.open(datarepo.MODE_READ)
Exemplo n.º 13
0
 def testDbFileWithoutTables(self):
     """
     Check that opening ``self._repoPath`` for reading, without the
     repository having been initialised in this test, raises
     RepoInvalidDatabaseException.
     """
     uninitialisedRepo = datarepo.SqlDataRepository(self._repoPath)
     expectedError = exceptions.RepoInvalidDatabaseException
     with self.assertRaises(expectedError):
         uninitialisedRepo.open(datarepo.MODE_READ)
Exemplo n.º 14
0
def main():
    """
    Build the ``repo2.db`` repository for the Simons Genome Diversity data.

    Any existing ``repo2.db`` is deleted first. The repository is then
    filled with the "Simons" dataset, the biosamples and individuals parsed
    from the SGDP metadata TSV files, the "NCBI37" reference set, the
    "so-xp" sequence ontology and, for every ``*.vcf.gz`` file under
    ``/mnt/ga4gh/data/vcf/``, two variant sets: a plain one whose call sets
    are linked to the biosample with the same local id, and an
    "-annotated-nh2" one whose variant annotation sets are inserted too.
    """
    # Set for using hg38 rather than hg19
    # reference_set_path = '/mnt/ga4gh/repo_data/hg38.fa.gz'
    reference_set_path = '/mnt/ga4gh/repo_data/hs37d5.fa.gz'

    bio_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.Biosample.tsv'
    ind_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.individual.tsv'

    bio_samples = parse_file_biosamples(bio_tsv_location)
    individuals = parse_file_individuals(ind_tsv_location)
    repoPath = "repo2.db"
    repo = datarepo.SqlDataRepository(repoPath)
    # Start from a clean file. os.remove replaces the former
    # os.system("rm ...") call: it needs no shell, is portable, and raises
    # if the stale database cannot be deleted instead of silently going on.
    if os.path.isfile(repoPath):
        os.remove(repoPath)
    repo.open("w")
    repo.initialise()

    dataset = datasets.Dataset("Simons")
    dataset.setDescription(
        "Variants from the Simons Foundation Genome Diversity Project")
    repo.insertDataset(dataset)

    print("Inserting biosamples")
    new_bio_samples = []
    for bio_sample in bio_samples:
        new_bio_sample = biodata.Biosample(
            dataset, unicode(bio_sample['name'], errors='replace'))
        new_bio_sample.populateFromJson(json.dumps(bio_sample))
        repo.insertBiosample(new_bio_sample)
        new_bio_samples.append(new_bio_sample)

    # Index the biosamples by local id once so that call sets can be
    # matched with a lookup rather than a scan per call set. On duplicate
    # local ids the last biosample wins, as with the original scan.
    bio_samples_by_local_id = {}
    for new_bio_sample in new_bio_samples:
        bio_samples_by_local_id[new_bio_sample.getLocalId()] = new_bio_sample

    print("Inserting individuals")
    for individual in individuals:
        new_individual = biodata.Individual(
            dataset, unicode(individual['name'], errors='replace'))
        new_individual.populateFromJson(json.dumps(individual))
        repo.insertIndividual(new_individual)

    print("Adding reference set (takes a while)")
    reference_set = references.HtslibReferenceSet("NCBI37")
    reference_set.populateFromFile(reference_set_path)
    reference_set.setDescription("NCBI37 assembly of the human genome")
    reference_set.setNcbiTaxonId(9606)
    reference_set.setSourceUri(
        "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
    )
    for reference in reference_set.getReferences():
        reference.setNcbiTaxonId(9606)
    repo.insertReferenceSet(reference_set)

    seq_ontology = ontologies.Ontology("/mnt/ga4gh/repo_data/so-xp")
    ontology_file_path = '/mnt/ga4gh/repo_data/so-xp-simple.obo'
    seq_ontology.populateFromFile(ontology_file_path)
    seq_ontology._id = "so-xp"
    # NOTE(review): the ontology is both inserted and added; confirm that
    # both calls are required by the repository API.
    repo.insertOntology(seq_ontology)
    repo.addOntology(seq_ontology)

    vcf_directory = os.path.dirname('/mnt/ga4gh/data/vcf/')
    pattern = os.path.join(vcf_directory, "*.vcf.gz")
    for vcfFile in glob.glob(pattern):
        # Derive the variant set name from the file name.
        name = vcfFile.replace("/mnt/ga4gh/data/vcf/", "")
        name = name.replace(".annotated.nh2.variants.vcf.gz", "")
        print(name)
        variant_set = variants.HtslibVariantSet(dataset, name)
        variant_set.setReferenceSet(reference_set)
        variant_set.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set.checkConsistency()
        # Link each call set to the biosample sharing its local id, if any.
        for call_set in variant_set.getCallSets():
            matching_bio_sample = bio_samples_by_local_id.get(
                call_set.getLocalId())
            if matching_bio_sample is not None:
                call_set.setBioSampleId(matching_bio_sample.getId())

        repo.insertVariantSet(variant_set)

        # Second variant set over the same file, carrying the annotations.
        name = name + "-annotated-nh2"
        print(name)
        variant_set2 = variants.HtslibVariantSet(dataset, name)
        variant_set2.setReferenceSet(reference_set)
        variant_set2.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set2.checkConsistency()
        repo.insertVariantSet(variant_set2)
        for annotation_set in variant_set2.getVariantAnnotationSets():
            print(str(annotation_set) + "found")
            annotation_set.setOntology(seq_ontology)
            repo.insertVariantAnnotationSet(annotation_set)

    repo.commit()
    print("database filled!")
Exemplo n.º 15
0
 def setUp(self):
     """
     Open the test data repository read-only and build a Backend and a
     LocalClient on top of it.
     """
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo = repo
     repo.open(datarepo.MODE_READ)
     localBackend = backend.Backend(repo)
     self._backend = localBackend
     self._client = client.LocalClient(localBackend)
Exemplo n.º 16
0
 def setUp(self):
     """
     Open 'registry.db' (relative to the working directory) read-only and
     build a Backend and a LocalClient on top of it.
     """
     repo = datarepo.SqlDataRepository('registry.db')
     repo.open(datarepo.MODE_READ)
     localBackend = backend.Backend(repo)
     self._backend = localBackend
     self._client = client.LocalClient(localBackend)
Exemplo n.º 17
0
 def __init__(self, registryDb):
     """
     Initialise the backend over the repository at ``registryDb``, opened
     read-only, and create a cProfile profiler for it.

     :param registryDb: path handed to SqlDataRepository
     """
     readOnlyRepo = datarepo.SqlDataRepository(registryDb)
     readOnlyRepo.open(datarepo.MODE_READ)
     super(CpuProfilerBackend, self).__init__(readOnlyRepo)
     self.profiler = cProfile.Profile()
Exemplo n.º 18
0
 def __init__(self, registryDb):
     """
     Initialise the backend over the repository at ``registryDb``, opened
     read-only, and create a guppy heap profiler for it.

     :param registryDb: path handed to SqlDataRepository
     """
     readOnlyRepo = datarepo.SqlDataRepository(registryDb)
     readOnlyRepo.open(datarepo.MODE_READ)
     super(HeapProfilerBackend, self).__init__(readOnlyRepo)
     self.profiler = guppy.hpy()
Exemplo n.º 19
0
 def testDirectory(self):
     """
     Check that opening a repository whose path comes from makeTempDir()
     raises RepoInvalidDatabaseException.
     """
     directoryRepo = datarepo.SqlDataRepository(makeTempDir())
     expectedError = exceptions.RepoInvalidDatabaseException
     with self.assertRaises(expectedError):
         directoryRepo.open(datarepo.MODE_READ)
Exemplo n.º 20
0
 def __init__(self, args):
     """
     Store the parsed arguments and create (without opening) the
     repository located at ``args.registryPath``.

     :param args: object exposing a ``registryPath`` attribute
     """
     self._args = args
     registryPath = args.registryPath
     self._registryPath = registryPath
     self._repo = datarepo.SqlDataRepository(registryPath)
Exemplo n.º 21
0
 def setUp(self):
     """
     Open the test data repository read-only as ``self._dataRepo``.
     """
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._dataRepo = repo
     repo.open(datarepo.MODE_READ)
Exemplo n.º 22
0
 def readRepo(self):
     """
     Return a new SqlDataRepository for ``self._repoPath`` that has been
     opened in read mode.
     """
     readOnlyRepo = datarepo.SqlDataRepository(self._repoPath)
     readOnlyRepo.open(datarepo.MODE_READ)
     return readOnlyRepo
Exemplo n.º 23
0
 def testNonexistantFile(self):
     """
     Check that opening a repository at a path that does not exist raises
     RepoNotFoundException.
     """
     missingRepo = datarepo.SqlDataRepository("aFilePathThatDoesNotExist")
     expectedError = exceptions.RepoNotFoundException
     with self.assertRaises(expectedError):
         missingRepo.open(datarepo.MODE_READ)
Exemplo n.º 24
0
            """)

    args = parser.parse_args()

    registryDb = "ga4gh-example-data/registry.db"

    if args.profile == 'heap':
        backendClass = HeapProfilerBackend
        backend = backendClass(registryDb)
        args.repeatLimit = 1
        args.pageLimit = 1
    elif args.profile == 'cpu':
        backendClass = CpuProfilerBackend
        backend = backendClass(registryDb)
    else:
        repo = datarepo.SqlDataRepository(registryDb)
        repo.open(datarepo.MODE_READ)
        backend = backend.Backend(repo)
    # Get our list of callSetids
    callSetIds = args.callSetIds
    if callSetIds != []:
        callSetIds = None
        if args.callSetIds != "*":
            callSetIds = args.callSetIds.split(",")

    minTime = benchmarkOneQuery(_heavyQuery(args.variantSetId, callSetIds),
                                args.repeatLimit, args.pageLimit)
    print(minTime)

    if args.profile == 'cpu':
        stats = pstats.Stats(backend.profiler)