Пример #1
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Builds the data-model object under test for ``localId``.

     When ``self._isAnnotated()`` is true, a backing HtslibVariantSet
     named "vs" is created from ``self._dataPath``, stored on
     ``self._variantSet``, and an HtslibVariantAnnotationSet wrapping it
     is returned. Otherwise a plain HtslibVariantSet built from
     ``dataPath`` is returned.
     """
     if not self._isAnnotated():
         # Plain variant data: no annotation wrapper is required.
         return variants.HtslibVariantSet(
             self._dataset, localId, dataPath, None)
     # Annotated data: the annotation set needs a backing variant set.
     self._variantSet = variants.HtslibVariantSet(
         self._dataset, "vs", self._dataPath, None)
     return variants.HtslibVariantAnnotationSet(
         self._dataset, localId, dataPath, self._datarepo,
         self._variantSet)
Пример #2
0
 def testInstantiation(self):
     """
     Checks that each variant set listed in ``self.localIds`` can be
     constructed but fails ``checkConsistency`` with an
     InconsistentMetaDataException.
     """
     for setName in self.localIds:
         vcfDir = self.getFullPath(setName)
         candidate = variants.HtslibVariantSet(
             self.dataset, setName, vcfDir, None)
         with self.assertRaises(exceptions.InconsistentMetaDataException):
             candidate.checkConsistency()
Пример #3
0
    def __init__(self, dataDir):
        """
        Builds the dataset from the directory ``dataDir``.

        The dataset's local ID is the final path component of
        ``dataDir``. Every sub-directory of ``<dataDir>/variants``
        becomes an HtslibVariantSet and every sub-directory of
        ``<dataDir>/reads`` becomes an HtslibReadGroupSet; the ID maps
        and their sorted ID lists are filled in accordingly.
        """
        datasetName = os.path.basename(os.path.normpath(dataDir))
        super(FileSystemDataset, self).__init__(datasetName)

        # Variants: one variant set per sub-directory.
        variantsRoot = os.path.join(dataDir, "variants")
        for entry in os.listdir(variantsRoot):
            entryPath = os.path.join(variantsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            variantSet = variants.HtslibVariantSet(self, entry, entryPath)
            self._variantSetIdMap[variantSet.getId()] = variantSet
        self._variantSetIds = sorted(self._variantSetIdMap.keys())

        # Reads: one read group set per sub-directory, plus an index of
        # every read group each set contains.
        readsRoot = os.path.join(dataDir, "reads")
        for entry in os.listdir(readsRoot):
            entryPath = os.path.join(readsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            readGroupSet = reads.HtslibReadGroupSet(self, entry, entryPath)
            self._readGroupSetIdMap[readGroupSet.getId()] = readGroupSet
            for readGroup in readGroupSet.getReadGroups():
                self._readGroupIdMap[readGroup.getId()] = readGroup
        self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
        self._readGroupIds = sorted(self._readGroupIdMap.keys())
Пример #4
0
 def testInstantiation(self):
     """
     Checks that each variant set listed in ``self.localIds`` can be
     populated from its directory but fails ``checkConsistency`` with an
     InconsistentCallSetIdException.
     """
     for setName in self.localIds:
         vcfDir = self.getFullPath(setName)
         candidate = variants.HtslibVariantSet(self.dataset, setName)
         candidate.populateFromDirectory(vcfDir)
         with self.assertRaises(exceptions.InconsistentCallSetIdException):
             candidate.checkConsistency()
Пример #5
0
    def __init__(self, datasetDir):
        """
        Builds the dataset from the directory ``datasetDir``.

        The dataset ID is the final path component of ``datasetDir``.
        Sub-directories of ``variants`` and ``reads`` are registered
        under compound IDs of the form ``<datasetId>:<directoryName>``.
        """
        super(FileSystemDataset, self).__init__()
        self._id = os.path.basename(os.path.normpath(datasetDir))
        self._datasetDir = datasetDir

        # Variants: one variant set per sub-directory.
        variantsRoot = os.path.join(self._datasetDir, "variants")
        for entry in os.listdir(variantsRoot):
            entryPath = os.path.join(variantsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            compoundId = '{}:{}'.format(self._id, entry)
            self._variantSetIdMap[compoundId] = variants.HtslibVariantSet(
                compoundId, entryPath)
        self._variantSetIds = sorted(self._variantSetIdMap.keys())

        # Reads: one read group set per sub-directory, plus an index of
        # every read group each set contains.
        readsRoot = os.path.join(self._datasetDir, "reads")
        for entry in os.listdir(readsRoot):
            entryPath = os.path.join(readsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            compoundId = '{}:{}'.format(self._id, entry)
            readGroupSet = reads.HtslibReadGroupSet(compoundId, entryPath)
            self._readGroupSetIdMap[compoundId] = readGroupSet
            for readGroup in readGroupSet.getReadGroups():
                self._readGroupIdMap[readGroup.getId()] = readGroup
        self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
        self._readGroupIds = sorted(self._readGroupIdMap.keys())
Пример #6
0
 def _readVariantSetTable(self, cursor):
     """
     Loads every row of the VariantSet table into the in-memory model.

     For each row the owning dataset and reference set are looked up by
     the IDs stored in the row, an HtslibVariantSet is built and
     populated from the row, and the result is attached to its dataset.
     """
     # Rows must be addressable by column name below.
     cursor.row_factory = sqlite3.Row
     cursor.execute("SELECT * FROM VariantSet;")
     for record in cursor:
         owner = self.getDataset(record[b'datasetId'])
         refSet = self.getReferenceSet(record[b'referenceSetId'])
         loaded = variants.HtslibVariantSet(owner, record[b'name'])
         loaded.setReferenceSet(refSet)
         loaded.populateFromRow(record)
         # The ID derived by the object must match the stored one.
         assert loaded.getId() == record[b'id']
         # Insert the variantSet into the memory-based object model.
         owner.addVariantSet(loaded)
Пример #7
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Builds the data-model object under test for ``localId``.

     A variant set is populated from the directory ``dataPath`` and
     given a placeholder reference set. If it reports being annotated,
     its first variant annotation set (with the sequence ontology
     attached) is returned; otherwise the variant set itself is
     returned.
     """
     variantSet = variants.HtslibVariantSet(datasets.Dataset("ds"), localId)
     variantSet.populateFromDirectory(dataPath)
     variantSet.setReferenceSet(references.AbstractReferenceSet("rs"))
     if not variantSet.isAnnotated():
         return variantSet
     # Annotated data: hand back the annotation set with its ontology.
     ontology = ontologies.Ontology(paths.ontologyName)
     ontology.populateFromFile(paths.ontologyPath)
     firstAnnotationSet = variantSet.getVariantAnnotationSets()[0]
     firstAnnotationSet.setOntology(ontology)
     return firstAnnotationSet
Пример #8
0
 def __init__(self, variantAnnotationSetId, baseDir):
     """
     Sets up the fixtures for a variant annotation set test.

     Creates the dataset, data repository and backing variant set, then
     reads every ``*.vcf.gz`` file under ``self._dataPath`` for which
     ``self._isAnnotated()`` is true, and finally records whether the
     data has a consequence field.
     """
     self._dataset = datasets.AbstractDataset("ds")
     self._datarepo = datarepo.FileSystemDataRepository("tests/data")
     super(VariantAnnotationSetTest, self).__init__(
         variantAnnotationSetId, baseDir)
     # NOTE(review): self._dataPath is presumably set by the superclass
     # constructor called above; confirm against the base class.
     self._variantSet = variants.HtslibVariantSet(
         self._dataset, "vs", self._dataPath, None)
     self._variantRecords = []
     self._referenceNames = set()
     # Only read in VCF files when the data is flagged as annotated.
     vcfPattern = os.path.join(self._dataPath, "*.vcf.gz")
     for vcfPath in glob.glob(vcfPattern):
         if not self._isAnnotated():
             continue
         self._readVcf(vcfPath)
     self._isCsq = self._hasConsequenceField()
Пример #9
0
 def _createVariantAnnotationSet(self, vcfDir):
     """
     Creates a VariantAnnotationSet from the specified directory of
     VCF files.

     The test data repository is opened read-only, a variant set named
     "testVariantSet" is populated from ``vcfDir``, and an annotation
     set "testVAs" is created on top of it using the ontology looked up
     by ``paths.ontologyName``. All objects are stored on ``self``.
     """
     self._variantSetName = "testVariantSet"
     repo = datarepo.SqlDataRepository(paths.testDataRepo)
     self._repo = repo
     repo.open(datarepo.MODE_READ)
     self._dataset = datasets.Dataset("testDs")
     variantSet = variants.HtslibVariantSet(
         self._dataset, self._variantSetName)
     self._variantSet = variantSet
     variantSet.populateFromDirectory(vcfDir)
     annotationSet = variants.HtslibVariantAnnotationSet(
         variantSet, "testVAs")
     self._variantAnnotationSet = annotationSet
     annotationSet.setOntology(repo.getOntologyByName(paths.ontologyName))
Пример #10
0
    def createRepo(self):
        """
        Creates the repository for all the data we've just downloaded.

        A fresh repository is initialised at ``self.repoPath`` and filled
        with one reference set (from ``self.fastaFilePath``), one dataset,
        one variant set (from ``self.vcfFilePaths``) and one read group
        set per sample (from ``self.samples`` / ``self.bamFilePaths``).
        After committing, the repository is reopened read-only to print a
        summary and is closed again before returning.
        """
        repo = datarepo.SqlDataRepository(self.repoPath)
        repo.open("w")
        repo.initialise()

        # The same taxon ID is applied to the set and to each reference.
        ncbiTaxonId = 9606
        referenceSet = references.HtslibReferenceSet("GRCh37-subset")
        referenceSet.populateFromFile(self.fastaFilePath)
        referenceSet.setDescription("Subset of GRCh37 used for demonstration")
        referenceSet.setNcbiTaxonId(ncbiTaxonId)
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(ncbiTaxonId)
            reference.setSourceAccessions(
                self.accessions[reference.getName()] + ".subset")
        repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("1kg-p3-subset")
        dataset.setDescription("Sample data from 1000 Genomes phase 3")
        repo.insertDataset(dataset)

        variantSet = variants.HtslibVariantSet(dataset, "mvncall")
        variantSet.setReferenceSet(referenceSet)
        # self.vcfFilePaths holds (data file, index file) pairs.
        dataUrls = [vcfFile for vcfFile, _ in self.vcfFilePaths]
        indexFiles = [indexFile for _, indexFile in self.vcfFilePaths]
        variantSet.populateFromFile(dataUrls, indexFiles)
        variantSet.checkConsistency()
        repo.insertVariantSet(variantSet)

        # One read group set per sample, paired with its BAM/index files.
        for sample, (bamFile, indexFile) in zip(self.samples,
                                                self.bamFilePaths):
            readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
            readGroupSet.populateFromFile(bamFile, indexFile)
            readGroupSet.setReferenceSet(referenceSet)
            repo.insertReadGroupSet(readGroupSet)

        repo.commit()
        repo.close()
        self.log("Finished creating the repository; summary:\n")
        # Reopen read-only just for the summary, and always release the
        # handle afterwards so it is not left open when we return.
        repo.open("r")
        try:
            repo.printSummary()
        finally:
            repo.close()
0
 def addVariantSet(self, variantFileName, dataset, referenceSet, ontology,
                   bioSamples):
     """
     Copies a VCF into the output directory, indexes it and registers
     the resulting variant set (and its annotation sets) in the repo.

     The variant set is named after the second ``_``-separated field of
     ``variantFileName``. Each call set whose local ID matches a
     bio sample's local ID is linked to that bio sample.
     """
     sourceVcf = os.path.join(self.inputDirectory, variantFileName)
     targetVcf = os.path.join(self.outputDirectory, variantFileName)
     shutil.copy(sourceVcf, targetVcf)
     pysam.tabix_index(targetVcf, preset="vcf")
     # After indexing, the data is read from the ".gz" file and its
     # ".gz.tbi" index sitting next to the copied VCF.
     compressedVcf = targetVcf + ".gz"
     setName = variantFileName.split('_')[1]
     variantSet = variants.HtslibVariantSet(dataset, setName)
     variantSet.setReferenceSet(referenceSet)
     variantSet.populateFromFile([compressedVcf], [compressedVcf + ".tbi"])
     variantSet.checkConsistency()
     # Link call sets to bio samples that share the same local ID.
     for callSet in variantSet.getCallSets():
         for bioSample in bioSamples:
             if bioSample.getLocalId() != callSet.getLocalId():
                 continue
             callSet.setBioSampleId(bioSample.getId())
     self.repo.insertVariantSet(variantSet)
     for annotationSet in variantSet.getVariantAnnotationSets():
         annotationSet.setOntology(ontology)
         self.repo.insertVariantAnnotationSet(annotationSet)
Пример #12
0
    def __init__(self, localId, dataDir, backend):
        """
        Builds the dataset ``localId`` from the directory ``dataDir``.

        Every sub-directory of ``<dataDir>/variants`` is added as an
        HtslibVariantSet and every ``*.bam`` file in ``<dataDir>/reads``
        is added as an HtslibReadGroupSet named after the file without
        its extension. ``backend`` is passed through to both.
        """
        super(FileSystemDataset, self).__init__(localId)

        # Variants
        variantsRoot = os.path.join(dataDir, "variants")
        for entry in os.listdir(variantsRoot):
            entryPath = os.path.join(variantsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            self.addVariantSet(
                variants.HtslibVariantSet(self, entry, entryPath, backend))
        # Reads
        readsRoot = os.path.join(dataDir, "reads")
        for filename in os.listdir(readsRoot):
            if not fnmatch.fnmatch(filename, '*.bam'):
                continue
            setName = os.path.splitext(filename)[0]
            bamPath = os.path.join(readsRoot, filename)
            self.addReadGroupSet(
                reads.HtslibReadGroupSet(self, setName, bamPath, backend))
Пример #13
0
    def __init__(self, localId, dataDir, dataRepository):
        """
        Builds the dataset ``localId`` from the directory ``dataDir``.

        Scans three sub-directories, named by ``self.variantsDirName``,
        ``self.readsDirName`` and ``self.featuresDirName``:

        - each variants sub-directory becomes an HtslibVariantSet, plus
          an HtslibVariantAnnotationSet when the variant set reports the
          directory as annotated;
        - each ``*.bam`` file becomes an HtslibReadGroupSet;
        - each ``*.db`` file becomes a Gff3DbFeatureSet.

        File-based sets are named after the file without its extension.
        """
        super(FileSystemDataset, self).__init__(localId)
        self._dataDir = dataDir
        self._setMetadata()

        # Variants
        variantsRoot = os.path.join(dataDir, self.variantsDirName)
        for entry in os.listdir(variantsRoot):
            entryPath = os.path.join(variantsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            variantSet = variants.HtslibVariantSet(
                self, entry, entryPath, dataRepository)
            self.addVariantSet(variantSet)
            # Variant annotations sets
            if variantSet.isAnnotated(entryPath):
                self.addVariantAnnotationSet(
                    variants.HtslibVariantAnnotationSet(
                        self, entry, entryPath, dataRepository,
                        variantSet))

        # Reads
        readsRoot = os.path.join(dataDir, self.readsDirName)
        for filename in os.listdir(readsRoot):
            if not fnmatch.fnmatch(filename, '*.bam'):
                continue
            setName = os.path.splitext(filename)[0]
            bamPath = os.path.join(readsRoot, filename)
            self.addReadGroupSet(reads.HtslibReadGroupSet(
                self, setName, bamPath, dataRepository))
        # Sequence Annotations
        featuresRoot = os.path.join(dataDir, self.featuresDirName)
        for filename in os.listdir(featuresRoot):
            if not fnmatch.fnmatch(filename, '*.db'):
                continue
            setName = os.path.splitext(filename)[0]
            dbPath = os.path.join(featuresRoot, filename)
            self.addFeatureSet(sequenceAnnotations.Gff3DbFeatureSet(
                self, setName, dbPath, dataRepository))
0
    def __init__(self, dataDir):
        """
        Builds the backend from the data tree rooted at ``dataDir``.

        Each sub-directory of ``variants``, ``references`` and ``reads``
        is registered under its directory name as a variant set,
        reference set or read group set respectively. Read groups are
        additionally indexed by their own IDs.
        """
        super(FileSystemBackend, self).__init__()
        self._dataDir = dataDir
        # TODO this code is very ugly and should be regarded as a temporary
        # stop-gap until we deal with iterating over the data tree properly.
        # Variants
        variantsRoot = os.path.join(self._dataDir, "variants")
        for entry in os.listdir(variantsRoot):
            entryPath = os.path.join(variantsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            self._variantSetIdMap[entry] = variants.HtslibVariantSet(
                entry, entryPath)
        self._variantSetIds = sorted(self._variantSetIdMap.keys())

        # References
        referencesRoot = os.path.join(self._dataDir, "references")
        for entry in os.listdir(referencesRoot):
            entryPath = os.path.join(referencesRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            self._referenceSetIdMap[entry] = references.ReferenceSet(
                entry, entryPath)
        self._referenceSetIds = sorted(self._referenceSetIdMap.keys())

        # Reads
        readsRoot = os.path.join(self._dataDir, "reads")
        for entry in os.listdir(readsRoot):
            entryPath = os.path.join(readsRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            readGroupSet = reads.HtslibReadGroupSet(entry, entryPath)
            self._readGroupSetIdMap[entry] = readGroupSet
            for readGroup in readGroupSet.getReadGroups():
                self._readGroupIdMap[readGroup.getId()] = readGroup
        self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
        self._readGroupIds = sorted(self._readGroupIdMap.keys())
Пример #15
0
 def testInstantiation(self):
     """
     Checks that constructing a variant set for each entry of
     ``self.localIds`` raises OverlappingVcfException.
     """
     for setName in self.localIds:
         vcfDir = self.getFullPath(setName)
         expectedError = exceptions.OverlappingVcfException
         with self.assertRaises(expectedError):
             variants.HtslibVariantSet(self.dataset, setName, vcfDir, None)
Пример #16
0
 def testInstantiation(self):
     """
     Checks that constructing a variant set for each entry of
     ``self.localIds`` raises InconsistentCallSetIdException.
     """
     for setName in self.localIds:
         vcfDir = self.getFullPath(setName)
         expectedError = exceptions.InconsistentCallSetIdException
         with self.assertRaises(expectedError):
             variants.HtslibVariantSet(self.dataset, setName, vcfDir, None)
Пример #17
0
 def testInstantiation(self):
     """
     Checks that populating a variant set from the directory of each
     entry of ``self.localIds`` raises OverlappingVcfException.
     """
     for setName in self.localIds:
         vcfDir = self.getFullPath(setName)
         candidate = variants.HtslibVariantSet(self.dataset, setName)
         with self.assertRaises(exceptions.OverlappingVcfException):
             candidate.populateFromDirectory(vcfDir)
Пример #18
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Returns an HtslibVariantSet for ``localId`` built from ``dataPath``
     within ``self._dataset``.
     """
     variantSet = variants.HtslibVariantSet(
         self._dataset, localId, dataPath, None)
     return variantSet
Пример #19
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Returns an HtslibVariantSet for ``localId`` populated from the
     directory ``dataPath`` and bound to a placeholder reference set.
     """
     instance = variants.HtslibVariantSet(self._dataset, localId)
     instance.populateFromDirectory(dataPath)
     instance.setReferenceSet(references.AbstractReferenceSet("test"))
     return instance
Пример #20
0
    def addVariantSet(self):
        """
        Adds a new VariantSet into this repo.

        The data files come from ``self._args.dataFiles``. A single
        argument may be a directory, in which case every ``*.vcf.gz``
        file inside it is used. The name defaults to one derived from
        the single path; with several files an explicit name is
        required. Index files default to ``<dataFile>.tbi`` and must
        then exist locally. The reference set is taken from the
        arguments or, failing that, from the VCF header. Annotation sets
        are added when present and requested. The variant set and its
        annotation sets are inserted in a single repo update.

        Raises RepoManagerException for missing VCFs, a missing name,
        an unresolvable reference set or a missing ontology name, and
        MissingIndexException when default index paths are needed for
        files that do not exist locally.
        """
        self._openRepo()
        args = self._args
        dataset = self._repo.getDatasetByName(args.datasetName)
        dataUrls = args.dataFiles
        name = args.name
        if len(dataUrls) == 1:
            soleUrl = dataUrls[0]
            if args.name is None:
                name = getNameFromPath(soleUrl)
            if os.path.isdir(soleUrl):
                # Read in the VCF files from the directory.
                # TODO support uncompressed VCF and BCF files
                dataUrls = glob.glob(os.path.join(soleUrl, "*.vcf.gz"))
                if not dataUrls:
                    raise exceptions.RepoManagerException(
                        "Cannot find any VCF files in the directory "
                        "'{}'.".format(soleUrl))
                dataUrls[0] = self._getFilePath(
                    dataUrls[0], args.relativePath)
        elif args.name is None:
            raise exceptions.RepoManagerException(
                "Cannot infer the intended name of the VariantSet when "
                "more than one VCF file is provided. Please provide a "
                "name argument using --name.")
        # Anything that is not an http/ftp URL is treated as a file path.
        scheme = urlparse.urlparse(dataUrls[0]).scheme
        if scheme not in ('http', 'ftp'):
            dataUrls = [
                self._getFilePath(url, args.relativePath)
                for url in dataUrls]
        # Now, get the index files for the data files that we've now obtained.
        indexFiles = args.indexFiles
        if indexFiles is None:
            # First check if all the paths exist locally, as they must
            # if we are making a default index path.
            for dataUrl in dataUrls:
                if os.path.exists(dataUrl):
                    continue
                raise exceptions.MissingIndexException(
                    "Cannot find file '{}'. All variant files must be "
                    "stored locally if the default index location is "
                    "used. If you are trying to create a VariantSet "
                    "based on remote URLs, please download the index "
                    "files to the local file system and provide them "
                    "with the --indexFiles argument".format(dataUrl))
            # We assume that the indexes are made by adding .tbi
            indexSuffix = ".tbi"
            # TODO support BCF input properly here by adding .csi
            indexFiles = [dataUrl + indexSuffix for dataUrl in dataUrls]
        indexFiles = [
            self._getFilePath(indexUrl, args.relativePath)
            for indexUrl in indexFiles]
        variantSet = variants.HtslibVariantSet(dataset, name)
        variantSet.populateFromFile(dataUrls, indexFiles)
        # Get the reference set that is associated with the variant set.
        referenceSetName = args.referenceSetName
        if referenceSetName is None:
            # Try to find a reference set name from the VCF header.
            referenceSetName = variantSet.getVcfHeaderReferenceSetName()
        if referenceSetName is None:
            raise exceptions.RepoManagerException(
                "Cannot infer the ReferenceSet from the VCF header. Please "
                "specify the ReferenceSet to associate with this "
                "VariantSet using the --referenceSetName option")
        variantSet.setReferenceSet(
            self._repo.getReferenceSetByName(referenceSetName))

        # Now check for annotations
        annotationSets = []
        if variantSet.isAnnotated() and args.addAnnotationSets:
            ontologyName = args.ontologyName
            if ontologyName is None:
                raise exceptions.RepoManagerException(
                    "A sequence ontology name must be provided")
            ontology = self._repo.getOntologyByName(ontologyName)
            self._checkSequenceOntology(ontology)
            for annotationSet in variantSet.getVariantAnnotationSets():
                annotationSet.setOntology(ontology)
                annotationSets.append(annotationSet)

        # Add the annotation sets and the variant set as an atomic update
        def updateRepo():
            self._repo.insertVariantSet(variantSet)
            for pendingSet in annotationSets:
                self._repo.insertVariantAnnotationSet(pendingSet)
        self._updateRepo(updateRepo)