def getDataModelInstance(self, localId, dataPath):
    """
    Builds the data model object for the given localId and dataPath.

    Returns an HtslibVariantAnnotationSet when self._isAnnotated() is
    true, and a plain HtslibVariantSet otherwise.
    """
    if not self._isAnnotated():
        return variants.HtslibVariantSet(
            self._dataset, localId, dataPath, None)
    # The variant set handed to the annotation set is built from
    # self._dataPath (not the dataPath argument) and kept on the instance.
    self._variantSet = variants.HtslibVariantSet(
        self._dataset, "vs", self._dataPath, None)
    return variants.HtslibVariantAnnotationSet(
        self._dataset, localId, dataPath, self._datarepo,
        self._variantSet)
def testInstantiation(self):
    """
    Checks that checkConsistency() raises InconsistentMetaDataException
    for a variant set built from each entry of self.localIds.
    """
    for localId in self.localIds:
        fullPath = self.getFullPath(localId)
        candidate = variants.HtslibVariantSet(
            self.dataset, localId, fullPath, None)
        # Construction must succeed; only the consistency check may fail.
        with self.assertRaises(
                exceptions.InconsistentMetaDataException):
            candidate.checkConsistency()
def __init__(self, dataDir):
    """
    Builds the dataset from the directory dataDir.

    The final path component of dataDir is used as the dataset's local
    ID. Each subdirectory of <dataDir>/variants is loaded as an
    HtslibVariantSet and each subdirectory of <dataDir>/reads as an
    HtslibReadGroupSet; non-directory entries are ignored.
    """
    datasetName = os.path.basename(os.path.normpath(dataDir))
    super(FileSystemDataset, self).__init__(datasetName)

    # Variants
    variantsRoot = os.path.join(dataDir, "variants")
    for entry in os.listdir(variantsRoot):
        entryPath = os.path.join(variantsRoot, entry)
        if not os.path.isdir(entryPath):
            continue
        variantSet = variants.HtslibVariantSet(self, entry, entryPath)
        self._variantSetIdMap[variantSet.getId()] = variantSet
    self._variantSetIds = sorted(self._variantSetIdMap.keys())

    # Reads
    readsRoot = os.path.join(dataDir, "reads")
    for entry in os.listdir(readsRoot):
        entryPath = os.path.join(readsRoot, entry)
        if not os.path.isdir(entryPath):
            continue
        readGroupSet = reads.HtslibReadGroupSet(self, entry, entryPath)
        self._readGroupSetIdMap[readGroupSet.getId()] = readGroupSet
        # Index the individual read groups by their own IDs as well.
        for readGroup in readGroupSet.getReadGroups():
            self._readGroupIdMap[readGroup.getId()] = readGroup
    self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
    self._readGroupIds = sorted(self._readGroupIdMap.keys())
def testInstantiation(self):
    """
    Checks that checkConsistency() raises InconsistentCallSetIdException
    for a variant set populated from each entry of self.localIds.
    """
    for localId in self.localIds:
        fullPath = self.getFullPath(localId)
        candidate = variants.HtslibVariantSet(self.dataset, localId)
        # Populating is expected to succeed; only the check may fail.
        candidate.populateFromDirectory(fullPath)
        with self.assertRaises(
                exceptions.InconsistentCallSetIdException):
            candidate.checkConsistency()
def __init__(self, datasetDir):
    """
    Builds the dataset from the directory datasetDir.

    The dataset ID is the final path component of datasetDir. Variant
    sets and read group sets are created from the subdirectories of
    "variants" and "reads" and registered under compound IDs of the form
    "<datasetId>:<directoryName>".
    """
    super(FileSystemDataset, self).__init__()
    self._id = os.path.basename(os.path.normpath(datasetDir))
    self._datasetDir = datasetDir

    # Variants
    variantsRoot = os.path.join(self._datasetDir, "variants")
    for dirName in os.listdir(variantsRoot):
        compoundId = '{}:{}'.format(self._id, dirName)
        dirPath = os.path.join(variantsRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        self._variantSetIdMap[compoundId] = variants.HtslibVariantSet(
            compoundId, dirPath)
    self._variantSetIds = sorted(self._variantSetIdMap.keys())

    # Reads
    readsRoot = os.path.join(self._datasetDir, "reads")
    for dirName in os.listdir(readsRoot):
        compoundId = '{}:{}'.format(self._id, dirName)
        dirPath = os.path.join(readsRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        readGroupSet = reads.HtslibReadGroupSet(compoundId, dirPath)
        self._readGroupSetIdMap[compoundId] = readGroupSet
        # Read groups are keyed by the ID they report themselves.
        for readGroup in readGroupSet.getReadGroups():
            self._readGroupIdMap[readGroup.getId()] = readGroup
    self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
    self._readGroupIds = sorted(self._readGroupIdMap.keys())
def _readVariantSetTable(self, cursor):
    """
    Loads every row of the VariantSet table into the in-memory model.

    For each row an HtslibVariantSet is created, linked to the dataset
    and reference set named by the row, populated from the row and then
    added to its dataset.
    """
    # sqlite3.Row allows the columns to be looked up by name below.
    cursor.row_factory = sqlite3.Row
    cursor.execute("SELECT * FROM VariantSet;")
    for record in cursor:
        owningDataset = self.getDataset(record[b'datasetId'])
        linkedReferenceSet = self.getReferenceSet(
            record[b'referenceSetId'])
        variantSet = variants.HtslibVariantSet(
            owningDataset, record[b'name'])
        variantSet.setReferenceSet(linkedReferenceSet)
        variantSet.populateFromRow(record)
        # The ID derived by the object must agree with the stored one.
        assert variantSet.getId() == record[b'id']
        # Insert the variantSet into the memory-based object model.
        owningDataset.addVariantSet(variantSet)
def getDataModelInstance(self, localId, dataPath):
    """
    Builds the data model object from the directory dataPath.

    A variant set is populated from dataPath and given a placeholder
    reference set. If it reports itself as annotated, its first variant
    annotation set is returned with an ontology attached; otherwise the
    variant set itself is returned.
    """
    variantSet = variants.HtslibVariantSet(
        datasets.Dataset("ds"), localId)
    variantSet.populateFromDirectory(dataPath)
    variantSet.setReferenceSet(references.AbstractReferenceSet("rs"))
    if not variantSet.isAnnotated():
        return variantSet
    ontology = ontologies.Ontology(paths.ontologyName)
    ontology.populateFromFile(paths.ontologyPath)
    # Only the first annotation set is used.
    firstAnnotationSet = variantSet.getVariantAnnotationSets()[0]
    firstAnnotationSet.setOntology(ontology)
    return firstAnnotationSet
def __init__(self, variantAnnotationSetId, baseDir):
    """
    Sets up the test for the given variant annotation set.

    The dataset and data repository are created before the superclass
    initialiser runs. Afterwards the "*.vcf.gz" files found under
    self._dataPath are read when the data is flagged as annotated.
    """
    self._dataset = datasets.AbstractDataset("ds")
    self._datarepo = datarepo.FileSystemDataRepository("tests/data")
    super(VariantAnnotationSetTest, self).__init__(
        variantAnnotationSetId, baseDir)
    self._variantSet = variants.HtslibVariantSet(
        self._dataset, "vs", self._dataPath, None)
    self._variantRecords = []
    self._referenceNames = set()
    # Only read in VCF files when self._isAnnotated() reports annotations.
    vcfPattern = os.path.join(self._dataPath, "*.vcf.gz")
    for vcfPath in glob.glob(vcfPattern):
        if self._isAnnotated():
            self._readVcf(vcfPath)
    self._isCsq = self._hasConsequenceField()
def _createVariantAnnotationSet(self, vcfDir):
    """
    Creates a VariantAnnotationSet from the specified directory of
    VCF files.

    The repository, dataset, variant set and annotation set created
    along the way are all stored on the instance.
    """
    self._variantSetName = "testVariantSet"
    repo = datarepo.SqlDataRepository(paths.testDataRepo)
    self._repo = repo
    self._repo.open(datarepo.MODE_READ)
    self._dataset = datasets.Dataset("testDs")
    self._variantSet = variants.HtslibVariantSet(
        self._dataset, self._variantSetName)
    self._variantSet.populateFromDirectory(vcfDir)
    self._variantAnnotationSet = variants.HtslibVariantAnnotationSet(
        self._variantSet, "testVAs")
    # The ontology is looked up in the repository opened above.
    ontology = self._repo.getOntologyByName(paths.ontologyName)
    self._variantAnnotationSet.setOntology(ontology)
def createRepo(self):
    """
    Creates the repository for all the data we've just downloaded.

    Inserts one reference set, one dataset, one variant set built from
    self.vcfFilePaths and one read group set per sample, then commits,
    closes the repository and reopens it to print a summary.
    """
    repo = datarepo.SqlDataRepository(self.repoPath)
    repo.open("w")
    repo.initialise()

    # Reference set
    referenceSet = references.HtslibReferenceSet("GRCh37-subset")
    referenceSet.populateFromFile(self.fastaFilePath)
    referenceSet.setDescription("Subset of GRCh37 used for demonstration")
    referenceSet.setNcbiTaxonId(9606)
    for reference in referenceSet.getReferences():
        reference.setNcbiTaxonId(9606)
        accession = self.accessions[reference.getName()] + ".subset"
        reference.setSourceAccessions(accession)
    repo.insertReferenceSet(referenceSet)

    # Dataset
    dataset = datasets.Dataset("1kg-p3-subset")
    dataset.setDescription("Sample data from 1000 Genomes phase 3")
    repo.insertDataset(dataset)

    # Variant set: self.vcfFilePaths holds (vcfFile, indexFile) pairs.
    variantSet = variants.HtslibVariantSet(dataset, "mvncall")
    variantSet.setReferenceSet(referenceSet)
    vcfUrls = [vcfPath for vcfPath, _ in self.vcfFilePaths]
    vcfIndexes = [indexPath for _, indexPath in self.vcfFilePaths]
    variantSet.populateFromFile(vcfUrls, vcfIndexes)
    variantSet.checkConsistency()
    repo.insertVariantSet(variantSet)

    # Read group sets: one per sample, paired positionally with the
    # (bamFile, indexFile) entries of self.bamFilePaths.
    samplesWithBams = zip(self.samples, self.bamFilePaths)
    for sample, (bamPath, bamIndexPath) in samplesWithBams:
        readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
        readGroupSet.populateFromFile(bamPath, bamIndexPath)
        readGroupSet.setReferenceSet(referenceSet)
        repo.insertReadGroupSet(readGroupSet)

    repo.commit()
    repo.close()
    self.log("Finished creating the repository; summary:\n")
    repo.open("r")
    repo.printSummary()
def addVariantSet(
        self, variantFileName, dataset, referenceSet, ontology,
        bioSamples):
    """
    Copies a VCF file to the output directory, indexes it and inserts
    the resulting variant set and its annotation sets into self.repo.

    The variant set is named after the second "_"-separated component
    of variantFileName. Call sets whose local ID equals that of one of
    bioSamples are given that bio sample's ID.
    """
    sourceVcf = os.path.join(self.inputDirectory, variantFileName)
    targetVcf = os.path.join(self.outputDirectory, variantFileName)
    shutil.copy(sourceVcf, targetVcf)
    pysam.tabix_index(targetVcf, preset="vcf")
    # After indexing, the data is read from "<targetVcf>.gz" with its
    # index at "<targetVcf>.gz.tbi".
    setName = variantFileName.split('_')[1]
    variantSet = variants.HtslibVariantSet(dataset, setName)
    variantSet.setReferenceSet(referenceSet)
    variantSet.populateFromFile(
        [targetVcf + ".gz"], [targetVcf + ".gz.tbi"])
    variantSet.checkConsistency()
    for callSet in variantSet.getCallSets():
        for bioSample in bioSamples:
            if callSet.getLocalId() != bioSample.getLocalId():
                continue
            callSet.setBioSampleId(bioSample.getId())
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
def __init__(self, localId, dataDir, backend):
    """
    Builds the dataset named localId from the directory dataDir.

    Each subdirectory of <dataDir>/variants is added as an
    HtslibVariantSet. Each "*.bam" file in <dataDir>/reads is added as
    an HtslibReadGroupSet named after the file without its extension.
    """
    super(FileSystemDataset, self).__init__(localId)

    # Variants
    variantsRoot = os.path.join(dataDir, "variants")
    for dirName in os.listdir(variantsRoot):
        dirPath = os.path.join(variantsRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        variantSet = variants.HtslibVariantSet(
            self, dirName, dirPath, backend)
        self.addVariantSet(variantSet)

    # Reads
    readsRoot = os.path.join(dataDir, "reads")
    for fileName in os.listdir(readsRoot):
        if not fnmatch.fnmatch(fileName, '*.bam'):
            continue
        stem, _ = os.path.splitext(fileName)
        bamPath = os.path.join(readsRoot, fileName)
        readGroupSet = reads.HtslibReadGroupSet(
            self, stem, bamPath, backend)
        self.addReadGroupSet(readGroupSet)
def __init__(self, localId, dataDir, dataRepository):
    """
    Builds the dataset named localId from the directory dataDir.

    Loads variant sets (and, where the variant set reports annotations,
    variant annotation sets) from the variants directory, read group
    sets from "*.bam" files in the reads directory and feature sets from
    "*.db" files in the features directory. The directory names come
    from self.variantsDirName, self.readsDirName and
    self.featuresDirName.
    """
    super(FileSystemDataset, self).__init__(localId)
    self._dataDir = dataDir
    self._setMetadata()

    # Variants
    variantsRoot = os.path.join(dataDir, self.variantsDirName)
    for dirName in os.listdir(variantsRoot):
        dirPath = os.path.join(variantsRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        variantSet = variants.HtslibVariantSet(
            self, dirName, dirPath, dataRepository)
        self.addVariantSet(variantSet)
        # Variant annotations sets
        if variantSet.isAnnotated(dirPath):
            annotationSet = variants.HtslibVariantAnnotationSet(
                self, dirName, dirPath, dataRepository, variantSet)
            self.addVariantAnnotationSet(annotationSet)

    # Reads
    readsRoot = os.path.join(dataDir, self.readsDirName)
    for fileName in os.listdir(readsRoot):
        if not fnmatch.fnmatch(fileName, '*.bam'):
            continue
        stem, _ = os.path.splitext(fileName)
        bamPath = os.path.join(readsRoot, fileName)
        readGroupSet = reads.HtslibReadGroupSet(
            self, stem, bamPath, dataRepository)
        self.addReadGroupSet(readGroupSet)

    # Sequence Annotations
    featuresRoot = os.path.join(dataDir, self.featuresDirName)
    for fileName in os.listdir(featuresRoot):
        if not fnmatch.fnmatch(fileName, '*.db'):
            continue
        stem, _ = os.path.splitext(fileName)
        dbPath = os.path.join(featuresRoot, fileName)
        featureSet = sequenceAnnotations.Gff3DbFeatureSet(
            self, stem, dbPath, dataRepository)
        self.addFeatureSet(featureSet)
def __init__(self, dataDir):
    """
    Builds the backend from the data tree rooted at dataDir.

    Every subdirectory of "variants", "references" and "reads" is loaded
    as a variant set, reference set and read group set respectively,
    keyed by the subdirectory name.
    """
    super(FileSystemBackend, self).__init__()
    self._dataDir = dataDir
    # TODO this code is very ugly and should be regarded as a temporary
    # stop-gap until we deal with iterating over the data tree properly.

    # Variants
    variantsRoot = os.path.join(self._dataDir, "variants")
    for dirName in os.listdir(variantsRoot):
        dirPath = os.path.join(variantsRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        self._variantSetIdMap[dirName] = variants.HtslibVariantSet(
            dirName, dirPath)
    self._variantSetIds = sorted(self._variantSetIdMap.keys())

    # References
    referencesRoot = os.path.join(self._dataDir, "references")
    for dirName in os.listdir(referencesRoot):
        dirPath = os.path.join(referencesRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        self._referenceSetIdMap[dirName] = references.ReferenceSet(
            dirName, dirPath)
    self._referenceSetIds = sorted(self._referenceSetIdMap.keys())

    # Reads
    readsRoot = os.path.join(self._dataDir, "reads")
    for dirName in os.listdir(readsRoot):
        dirPath = os.path.join(readsRoot, dirName)
        if not os.path.isdir(dirPath):
            continue
        readGroupSet = reads.HtslibReadGroupSet(dirName, dirPath)
        self._readGroupSetIdMap[dirName] = readGroupSet
        # Read groups are keyed by the ID they report themselves.
        for readGroup in readGroupSet.getReadGroups():
            self._readGroupIdMap[readGroup.getId()] = readGroup
    self._readGroupSetIds = sorted(self._readGroupSetIdMap.keys())
    self._readGroupIds = sorted(self._readGroupIdMap.keys())
def testInstantiation(self):
    """
    Checks that constructing an HtslibVariantSet raises
    OverlappingVcfException for each entry of self.localIds.
    """
    for localId in self.localIds:
        fullPath = self.getFullPath(localId)
        # Here the constructor itself is expected to fail.
        with self.assertRaises(exceptions.OverlappingVcfException):
            variants.HtslibVariantSet(
                self.dataset, localId, fullPath, None)
def testInstantiation(self):
    """
    Checks that constructing an HtslibVariantSet raises
    InconsistentCallSetIdException for each entry of self.localIds.
    """
    for localId in self.localIds:
        fullPath = self.getFullPath(localId)
        # Here the constructor itself is expected to fail.
        with self.assertRaises(
                exceptions.InconsistentCallSetIdException):
            variants.HtslibVariantSet(
                self.dataset, localId, fullPath, None)
def testInstantiation(self):
    """
    Checks that populateFromDirectory() raises OverlappingVcfException
    for each entry of self.localIds.
    """
    for localId in self.localIds:
        fullPath = self.getFullPath(localId)
        # Construction happens outside the assertion, so it must succeed.
        candidate = variants.HtslibVariantSet(self.dataset, localId)
        with self.assertRaises(exceptions.OverlappingVcfException):
            candidate.populateFromDirectory(fullPath)
def getDataModelInstance(self, localId, dataPath):
    """
    Returns an HtslibVariantSet for localId built from dataPath, owned
    by self._dataset; None is passed as the final constructor argument.
    """
    variantSet = variants.HtslibVariantSet(
        self._dataset, localId, dataPath, None)
    return variantSet
def getDataModelInstance(self, localId, dataPath):
    """
    Returns an HtslibVariantSet populated from the directory dataPath
    and linked to a placeholder reference set named "test".
    """
    instance = variants.HtslibVariantSet(self._dataset, localId)
    instance.populateFromDirectory(dataPath)
    instance.setReferenceSet(references.AbstractReferenceSet("test"))
    return instance
def addVariantSet(self):
    """
    Adds a new VariantSet into this repo.

    The data files are taken from self._args.dataFiles. A single
    directory argument is expanded to the "*.vcf.gz" files it contains.
    When no index files are given, the index of each data file is
    assumed to be the data file's path plus ".tbi", which requires all
    data files to exist locally.

    Raises:
        exceptions.RepoManagerException: if a directory contains no VCF
            files, if several files are given without a name, if the
            reference set cannot be inferred from the VCF header, or if
            annotation sets are requested without an ontology name.
        exceptions.MissingIndexException: if default index paths are
            needed but a data file does not exist locally.
    """
    self._openRepo()
    dataset = self._repo.getDatasetByName(self._args.datasetName)
    dataUrls = self._args.dataFiles
    name = self._args.name
    if len(dataUrls) == 1:
        if self._args.name is None:
            name = getNameFromPath(dataUrls[0])
        if os.path.isdir(dataUrls[0]):
            # Read in the VCF files from the directory.
            # TODO support uncompressed VCF and BCF files
            vcfDir = dataUrls[0]
            pattern = os.path.join(vcfDir, "*.vcf.gz")
            dataUrls = glob.glob(pattern)
            if not dataUrls:
                raise exceptions.RepoManagerException(
                    "Cannot find any VCF files in the directory "
                    "'{}'.".format(vcfDir))
            # NOTE(review): dataUrls[0] goes through _getFilePath here
            # and again in the comprehension below -- confirm that
            # _getFilePath is safe to apply twice.
            dataUrls[0] = self._getFilePath(
                dataUrls[0], self._args.relativePath)
    elif self._args.name is None:
        raise exceptions.RepoManagerException(
            "Cannot infer the intended name of the VariantSet when "
            "more than one VCF file is provided. Please provide a "
            "name argument using --name.")
    # Only the first entry decides whether the inputs are remote.
    # NOTE(review): 'https' is not in this list, so https URLs are passed
    # through _getFilePath like local paths -- confirm this is intended.
    parsed = urlparse.urlparse(dataUrls[0])
    if parsed.scheme not in ['http', 'ftp']:
        # A list comprehension (not map) so the result is a real list:
        # it is iterated more than once below and on Python 3 map()
        # would give a one-shot iterator.
        dataUrls = [
            self._getFilePath(url, self._args.relativePath)
            for url in dataUrls]
    # Now, get the index files for the data files that we've now obtained.
    indexFiles = self._args.indexFiles
    if indexFiles is None:
        # First check if all the paths exist locally, as they must
        # if we are making a default index path.
        for dataUrl in dataUrls:
            if not os.path.exists(dataUrl):
                raise exceptions.MissingIndexException(
                    "Cannot find file '{}'. All variant files must be "
                    "stored locally if the default index location is "
                    "used. If you are trying to create a VariantSet "
                    "based on remote URLs, please download the index "
                    "files to the local file system and provide them "
                    "with the --indexFiles argument".format(dataUrl))
        # We assume that the indexes are made by adding .tbi
        indexSuffix = ".tbi"
        # TODO support BCF input properly here by adding .csi
        indexFiles = [filename + indexSuffix for filename in dataUrls]
    indexFiles = [
        self._getFilePath(url, self._args.relativePath)
        for url in indexFiles]
    variantSet = variants.HtslibVariantSet(dataset, name)
    variantSet.populateFromFile(dataUrls, indexFiles)
    # Get the reference set that is associated with the variant set.
    referenceSetName = self._args.referenceSetName
    if referenceSetName is None:
        # Try to find a reference set name from the VCF header.
        referenceSetName = variantSet.getVcfHeaderReferenceSetName()
    if referenceSetName is None:
        raise exceptions.RepoManagerException(
            "Cannot infer the ReferenceSet from the VCF header. Please "
            "specify the ReferenceSet to associate with this "
            "VariantSet using the --referenceSetName option")
    referenceSet = self._repo.getReferenceSetByName(referenceSetName)
    variantSet.setReferenceSet(referenceSet)

    # Now check for annotations
    annotationSets = []
    if variantSet.isAnnotated() and self._args.addAnnotationSets:
        ontologyName = self._args.ontologyName
        if ontologyName is None:
            raise exceptions.RepoManagerException(
                "A sequence ontology name must be provided")
        ontology = self._repo.getOntologyByName(ontologyName)
        self._checkSequenceOntology(ontology)
        for annotationSet in variantSet.getVariantAnnotationSets():
            annotationSet.setOntology(ontology)
            annotationSets.append(annotationSet)

    # Add the annotation sets and the variant set as an atomic update
    def updateRepo():
        self._repo.insertVariantSet(variantSet)
        for annotationSet in annotationSets:
            self._repo.insertVariantAnnotationSet(annotationSet)
    self._updateRepo(updateRepo)