예제 #1
0
 def testInstantiation(self):
     """
     Verifies that each variant set named in ``self.localIds`` fails its
     consistency check with an InconsistentCallSetIdException.
     """
     for variantSetId in self.localIds:
         vcfDirectory = self.getFullPath(variantSetId)
         candidate = variants.HtslibVariantSet(self.dataset, variantSetId)
         # Population runs outside the assertRaises block, so only
         # checkConsistency() can satisfy the expected exception.
         candidate.populateFromDirectory(vcfDirectory)
         with self.assertRaises(exceptions.InconsistentCallSetIdException):
             candidate.checkConsistency()
예제 #2
0
    def addVariantSet(
            self, variantFileName, dataset, referenceSet,
            ontology, biosamples):
        """
        Copies a VCF file into the output directory, indexes the copy and
        inserts the resulting variant set, plus any variant annotation
        sets it exposes, into ``self.repo``.

        :param variantFileName: file name of the VCF inside
            ``self.inputDirectory``. The second underscore-separated
            token of this name is used as the variant set's name, so a
            name without an underscore raises IndexError.
        :param dataset: dataset the new variant set belongs to.
        :param referenceSet: reference set to attach to the variant set.
        :param ontology: ontology assigned to every annotation set.
        :param biosamples: iterable of biosamples; a call set is linked
            to the biosample whose local ID equals the call set's.
        """
        inputVcf = os.path.join(self.inputDirectory, variantFileName)
        outputVcf = os.path.join(self.outputDirectory, variantFileName)
        shutil.copy(inputVcf, outputVcf)
        # The paths read below are the copy's path plus ".gz" and
        # ".gz.tbi", i.e. this relies on tabix_index leaving a compressed
        # file and its index next to the copy.
        pysam.tabix_index(outputVcf, preset="vcf")
        variantSet = variants.HtslibVariantSet(
            dataset, variantFileName.split('_')[1])
        variantSet.setReferenceSet(referenceSet)
        variantSet.populateFromFile(
            [os.path.abspath(outputVcf + ".gz")],
            [os.path.abspath(outputVcf + ".gz.tbi")])
        variantSet.checkConsistency()

        # Index the biosamples once instead of rescanning the whole list
        # for every call set. When several biosamples share a local ID the
        # last one wins, which is also what a full scan would leave behind.
        biosamplesByLocalId = {
            biosample.getLocalId(): biosample for biosample in biosamples}
        for callSet in variantSet.getCallSets():
            matched = biosamplesByLocalId.get(callSet.getLocalId())
            if matched is not None:
                callSet.setBiosampleId(matched.getId())
        self.repo.insertVariantSet(variantSet)

        for annotationSet in variantSet.getVariantAnnotationSets():
            annotationSet.setOntology(ontology)
            self.repo.insertVariantAnnotationSet(annotationSet)
예제 #3
0
파일: datarepo.py 프로젝트: ejacox/server
 def _readVariantSetTable(self):
     """
     Rebuilds an HtslibVariantSet for every row returned by
     ``m.Variantset.select()`` and attaches it to its dataset.
     """
     for row in m.Variantset.select():
         owningDataset = self.getDataset(row.datasetid.id)
         linkedReferenceSet = self.getReferenceSet(row.referencesetid.id)
         restored = variants.HtslibVariantSet(owningDataset, row.name)
         restored.setReferenceSet(linkedReferenceSet)
         restored.populateFromRow(row)
         # The rebuilt object's ID must match the ID stored in the row.
         assert restored.getId() == row.id
         # Register the variant set in the memory-based object model.
         owningDataset.addVariantSet(restored)
예제 #4
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Builds a variant set from the VCF directory ``dataPath``.

     Returns the variant set itself when it is not annotated; otherwise
     returns its first variant annotation set with the sequence ontology
     attached.
     """
     populated = variants.HtslibVariantSet(datasets.Dataset("ds"), localId)
     populated.populateFromDirectory(dataPath)
     populated.setReferenceSet(references.AbstractReferenceSet("rs"))
     if not populated.isAnnotated():
         return populated
     ontology = ontologies.Ontology(paths.ontologyName)
     ontology.populateFromFile(paths.ontologyPath)
     firstAnnotationSet = populated.getVariantAnnotationSets()[0]
     firstAnnotationSet.setOntology(ontology)
     return firstAnnotationSet
 def _createVariantAnnotationSet(self, vcfDir):
     """
     Builds the test fixtures for a VariantAnnotationSet from the VCF
     files in ``vcfDir``.

     Sets ``_variantSetName``, ``_repo`` (opened in read mode),
     ``_dataset``, ``_variantSet`` and ``_variantAnnotationSet`` on this
     instance; the annotation set's ontology is looked up by name in the
     repository.
     """
     setName = "testVariantSet"
     self._variantSetName = setName
     repo = self._repo = datarepo.SqlDataRepository(paths.testDataRepo)
     repo.open(datarepo.MODE_READ)
     dataset = self._dataset = datasets.Dataset("testDs")
     variantSet = self._variantSet = variants.HtslibVariantSet(
         dataset, setName)
     variantSet.populateFromDirectory(vcfDir)
     annotationSet = variants.HtslibVariantAnnotationSet(
         variantSet, "testVAs")
     self._variantAnnotationSet = annotationSet
     ontology = repo.getOntologyByName(paths.ontologyName)
     annotationSet.setOntology(ontology)
    def createRepo(self):
        """
        Creates the repository for all the data we've just downloaded.

        Writes, in order, the GRCh37-subset reference set (read from
        ``self.fastaFilePath``), the "1kg-p3-subset" dataset, the
        "mvncall" variant set (from ``self.vcfFilePaths``) and one read
        group set per entry of ``self.samples`` paired with
        ``self.bamFilePaths``. After committing, the repository is
        reopened read-only to print a summary and is then closed again.
        """
        # The reference set and each of its references carry the same
        # species record, so the JSON is built once.
        speciesJson = (
            '{"id": "9606",'
            '"term": "Homo sapiens", "source_name": "NCBI"}')

        repo = datarepo.SqlDataRepository(self.repoPath)
        repo.open("w")
        repo.initialise()

        referenceSet = references.HtslibReferenceSet("GRCh37-subset")
        referenceSet.populateFromFile(self.fastaFilePath)
        referenceSet.setDescription("Subset of GRCh37 used for demonstration")
        referenceSet.setSpeciesFromJson(speciesJson)
        for reference in referenceSet.getReferences():
            reference.setSpeciesFromJson(speciesJson)
            reference.setSourceAccessions(
                self.accessions[reference.getName()] + ".subset")
        repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("1kg-p3-subset")
        dataset.setDescription("Sample data from 1000 Genomes phase 3")
        repo.insertDataset(dataset)

        variantSet = variants.HtslibVariantSet(dataset, "mvncall")
        variantSet.setReferenceSet(referenceSet)
        # self.vcfFilePaths holds (data file, index file) pairs.
        dataUrls = [vcfFile for vcfFile, _ in self.vcfFilePaths]
        indexFiles = [indexFile for _, indexFile in self.vcfFilePaths]
        variantSet.populateFromFile(dataUrls, indexFiles)
        variantSet.checkConsistency()
        repo.insertVariantSet(variantSet)

        for sample, (bamFile, indexFile) in zip(self.samples,
                                                self.bamFilePaths):
            readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
            readGroupSet.populateFromFile(bamFile, indexFile)
            readGroupSet.setReferenceSet(referenceSet)
            repo.insertReadGroupSet(readGroupSet)

        repo.commit()
        repo.close()
        self.log("Finished creating the repository; summary:\n")
        repo.open("r")
        try:
            repo.printSummary()
        finally:
            # The read-only handle is local to this method, so release it
            # here rather than leaving the repository open.
            repo.close()
예제 #7
0
 def testInstantiation(self):
     """
     Verifies that populating each variant set named in
     ``self.localIds`` from its directory raises
     OverlappingVcfException.
     """
     for variantSetId in self.localIds:
         vcfDirectory = self.getFullPath(variantSetId)
         overlapping = variants.HtslibVariantSet(self.dataset, variantSetId)
         with self.assertRaises(exceptions.OverlappingVcfException):
             overlapping.populateFromDirectory(vcfDirectory)
예제 #8
0
    def addVariantSet(self):
        """
        Adds a new VariantSet into this repo.

        The data files come from ``self._args.dataFiles``. A single
        argument may be a directory, in which case every ``*.vcf.gz``
        file inside it is used. The variant set name defaults to a name
        derived from the single path; with several files ``--name`` is
        mandatory. Index files default to the data file paths plus
        ".tbi", which requires all data files to exist locally.

        Raises RepoManagerException when no VCF file is found in the
        directory, when the name or the reference set cannot be inferred,
        or when annotation sets are requested without an ontology name.
        Raises MissingIndexException when default index paths are needed
        but a data file does not exist locally.
        """
        self._openRepo()
        dataset = self._repo.getDatasetByName(self._args.datasetName)
        dataUrls = self._args.dataFiles
        name = self._args.name
        relativePath = self._args.relativePath
        if len(dataUrls) == 1:
            if name is None:
                name = getNameFromPath(dataUrls[0])
            if os.path.isdir(dataUrls[0]):
                # Read in the VCF files from the directory.
                # TODO support uncompressed VCF and BCF files
                vcfDir = dataUrls[0]
                dataUrls = glob.glob(os.path.join(vcfDir, "*.vcf.gz"))
                if not dataUrls:
                    raise exceptions.RepoManagerException(
                        "Cannot find any VCF files in the directory "
                        "'{}'.".format(vcfDir))
                # NOTE(review): the first path is resolved here and again
                # with all the others below; presumably _getFilePath is
                # idempotent - confirm.
                dataUrls[0] = self._getFilePath(dataUrls[0], relativePath)
        elif name is None:
            raise exceptions.RepoManagerException(
                "Cannot infer the intended name of the VariantSet when "
                "more than one VCF file is provided. Please provide a "
                "name argument using --name.")
        # Only the first URL's scheme decides whether the whole list is
        # treated as remote or as local paths.
        parsed = urlparse.urlparse(dataUrls[0])
        if parsed.scheme not in ['http', 'ftp']:
            # Build real lists rather than map() results: the paths are
            # iterated several times below, which a one-shot iterator
            # would not survive.
            dataUrls = [
                self._getFilePath(url, relativePath) for url in dataUrls]
        # Now, get the index files for the data files that we've now obtained.
        indexFiles = self._args.indexFiles
        if indexFiles is None:
            # First check if all the paths exist locally, as they must
            # if we are making a default index path.
            for dataUrl in dataUrls:
                if not os.path.exists(dataUrl):
                    raise exceptions.MissingIndexException(
                        "Cannot find file '{}'. All variant files must be "
                        "stored locally if the default index location is "
                        "used. If you are trying to create a VariantSet "
                        "based on remote URLs, please download the index "
                        "files to the local file system and provide them "
                        "with the --indexFiles argument".format(dataUrl))
            # We assume that the indexes are made by adding .tbi
            indexSuffix = ".tbi"
            # TODO support BCF input properly here by adding .csi
            indexFiles = [filename + indexSuffix for filename in dataUrls]
        indexFiles = [
            self._getFilePath(url, relativePath) for url in indexFiles]
        variantSet = variants.HtslibVariantSet(dataset, name)
        variantSet.populateFromFile(dataUrls, indexFiles)
        # Get the reference set that is associated with the variant set.
        referenceSetName = self._args.referenceSetName
        if referenceSetName is None:
            # Try to find a reference set name from the VCF header.
            referenceSetName = variantSet.getVcfHeaderReferenceSetName()
        if referenceSetName is None:
            raise exceptions.RepoManagerException(
                "Cannot infer the ReferenceSet from the VCF header. Please "
                "specify the ReferenceSet to associate with this "
                "VariantSet using the --referenceSetName option")
        referenceSet = self._repo.getReferenceSetByName(referenceSetName)
        variantSet.setReferenceSet(referenceSet)
        variantSet.setAttributes(json.loads(self._args.attributes))
        # Now check for annotations
        annotationSets = []
        if variantSet.isAnnotated() and self._args.addAnnotationSets:
            ontologyName = self._args.ontologyName
            if ontologyName is None:
                raise exceptions.RepoManagerException(
                    "A sequence ontology name must be provided")
            ontology = self._repo.getOntologyByName(ontologyName)
            self._checkSequenceOntology(ontology)
            for annotationSet in variantSet.getVariantAnnotationSets():
                annotationSet.setOntology(ontology)
                annotationSets.append(annotationSet)

        # Add the annotation sets and the variant set as an atomic update
        def updateRepo():
            self._repo.insertVariantSet(variantSet)
            for annotationSet in annotationSets:
                self._repo.insertVariantAnnotationSet(annotationSet)
        self._updateRepo(updateRepo)
예제 #9
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Returns an HtslibVariantSet in ``self._dataset`` populated from the
     directory ``dataPath`` and linked to a placeholder reference set
     named "test".
     """
     instance = variants.HtslibVariantSet(self._dataset, localId)
     instance.populateFromDirectory(dataPath)
     instance.setReferenceSet(references.AbstractReferenceSet("test"))
     return instance
예제 #10
0
def main():
    """
    Builds the ``repo2.db`` repository for the Simons Genome Diversity
    Project data.

    Any existing ``repo2.db`` in the working directory is deleted first.
    The function then inserts, in order: the "Simons" dataset, the
    biosamples and individuals parsed from the two metadata TSV files,
    the NCBI37 reference set, the sequence ontology, and, for every
    ``*.vcf.gz`` file in the VCF directory, two variant sets: one whose
    call sets are linked to the biosample with the same local ID, and
    one named ``<name>-annotated-nh2`` whose variant annotation sets are
    stored with the sequence ontology attached. The repository is
    committed and closed at the end.
    """
    # Set for using hg38 rather than hg19
    # reference_set_path = '/mnt/ga4gh/repo_data/hg38.fa.gz'
    reference_set_path = '/mnt/ga4gh/repo_data/hs37d5.fa.gz'

    bio_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.Biosample.tsv'
    ind_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.individual.tsv'

    bio_samples = parse_file_biosamples(bio_tsv_location)
    individuals = parse_file_individuals(ind_tsv_location)
    repo_path = "repo2.db"
    repo = datarepo.SqlDataRepository(repo_path)
    # Start from a clean database file; os.remove raises if the deletion
    # fails instead of silently carrying on with the stale file.
    if os.path.isfile(repo_path):
        os.remove(repo_path)
    repo.open("w")
    repo.initialise()

    dataset = datasets.Dataset("Simons")
    dataset.setDescription(
        "Variants from the Simons Foundation Genome Diversity Project")
    repo.insertDataset(dataset)

    print("Inserting biosamples")
    # Indexed by local ID so call sets can be matched without rescanning
    # every biosample; on duplicate local IDs the last one inserted wins.
    bio_samples_by_local_id = {}
    for bio_sample in bio_samples:
        new_bio_sample = biodata.Biosample(
            dataset, unicode(bio_sample['name'], errors='replace'))
        new_bio_sample.populateFromJson(json.dumps(bio_sample))
        repo.insertBiosample(new_bio_sample)
        bio_samples_by_local_id[new_bio_sample.getLocalId()] = new_bio_sample

    print("Inserting individuals")
    for individual in individuals:
        new_individual = biodata.Individual(
            dataset, unicode(individual['name'], errors='replace'))
        new_individual.populateFromJson(json.dumps(individual))
        repo.insertIndividual(new_individual)

    print("Adding reference set (takes a while)")
    reference_set = references.HtslibReferenceSet("NCBI37")
    reference_set.populateFromFile(reference_set_path)
    reference_set.setDescription("NCBI37 assembly of the human genome")
    reference_set.setNcbiTaxonId(9606)
    reference_set.setSourceUri(
        "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
    )
    for reference in reference_set.getReferences():
        reference.setNcbiTaxonId(9606)
    repo.insertReferenceSet(reference_set)

    seq_ontology = ontologies.Ontology("/mnt/ga4gh/repo_data/so-xp")
    ontology_file_path = '/mnt/ga4gh/repo_data/so-xp-simple.obo'
    seq_ontology.populateFromFile(ontology_file_path)
    # Overrides the ontology's ID directly before it is stored.
    seq_ontology._id = "so-xp"
    repo.insertOntology(seq_ontology)
    repo.addOntology(seq_ontology)

    vcf_directory = os.path.dirname('/mnt/ga4gh/data/vcf/')
    pattern = os.path.join(vcf_directory, "*.vcf.gz")
    for vcfFile in glob.glob(pattern):
        # The variant set name is the file name without its directory and
        # without the fixed annotation suffix.
        name = os.path.basename(vcfFile)
        name = name.replace(".annotated.nh2.variants.vcf.gz", "")
        print(name)
        variant_set = variants.HtslibVariantSet(dataset, name)
        variant_set.setReferenceSet(reference_set)
        variant_set.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set.checkConsistency()
        for call_set in variant_set.getCallSets():
            matched = bio_samples_by_local_id.get(call_set.getLocalId())
            if matched is not None:
                # NOTE(review): other GA4GH server code spells this
                # setter setBiosampleId - confirm which name the
                # installed CallSet class provides.
                call_set.setBioSampleId(matched.getId())

        repo.insertVariantSet(variant_set)

        # The same VCF is registered a second time under a different name
        # to carry the variant annotation sets.
        name = name + "-annotated-nh2"
        print(name)
        variant_set2 = variants.HtslibVariantSet(dataset, name)
        variant_set2.setReferenceSet(reference_set)
        variant_set2.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set2.checkConsistency()
        repo.insertVariantSet(variant_set2)
        for annotation_set in variant_set2.getVariantAnnotationSets():
            print(str(annotation_set) + "found")
            annotation_set.setOntology(seq_ontology)
            repo.insertVariantAnnotationSet(annotation_set)

    repo.commit()
    # Release the database handle once everything has been committed.
    repo.close()
    print("database filled!")