def testInstantiation(self):
    """
    Loading each local dataset must leave the variant set in an
    inconsistent state: checkConsistency() is expected to raise
    InconsistentCallSetIdException.
    """
    for identifier in self.localIds:
        fullPath = self.getFullPath(identifier)
        inconsistentSet = variants.HtslibVariantSet(self.dataset, identifier)
        inconsistentSet.populateFromDirectory(fullPath)
        with self.assertRaises(exceptions.InconsistentCallSetIdException):
            inconsistentSet.checkConsistency()
def addVariantSet(
        self, variantFileName, dataset, referenceSet, ontology, biosamples):
    """
    Copies the named VCF into the output directory, tabix-indexes it,
    builds an HtslibVariantSet from the compressed copy, links call sets
    to matching biosamples, and inserts the variant set plus any
    annotation sets into the repo.
    """
    srcVcf = os.path.join(self.inputDirectory, variantFileName)
    dstVcf = os.path.join(self.outputDirectory, variantFileName)
    shutil.copy(srcVcf, dstVcf)
    # tabix_index compresses the copy to .gz and writes a .tbi next to it.
    pysam.tabix_index(dstVcf, preset="vcf")
    # The variant set's local id is the second '_'-separated token of the
    # file name.
    variantSet = variants.HtslibVariantSet(
        dataset, variantFileName.split('_')[1])
    variantSet.setReferenceSet(referenceSet)
    compressedVcf = os.path.abspath(dstVcf + ".gz")
    variantSet.populateFromFile([compressedVcf], [compressedVcf + ".tbi"])
    variantSet.checkConsistency()
    # Attach biosamples to call sets that share the same local id.
    for callSet in variantSet.getCallSets():
        for biosample in biosamples:
            if biosample.getLocalId() == callSet.getLocalId():
                callSet.setBiosampleId(biosample.getId())
    self.repo.insertVariantSet(variantSet)
    for annotationSet in variantSet.getVariantAnnotationSets():
        annotationSet.setOntology(ontology)
        self.repo.insertVariantAnnotationSet(annotationSet)
def _readVariantSetTable(self):
    """
    Reads every row of the Variantset table and materialises it as an
    HtslibVariantSet attached to its parent dataset in the in-memory
    object model.
    """
    for record in m.Variantset.select():
        parentDataset = self.getDataset(record.datasetid.id)
        refSet = self.getReferenceSet(record.referencesetid.id)
        variantSet = variants.HtslibVariantSet(parentDataset, record.name)
        variantSet.setReferenceSet(refSet)
        variantSet.populateFromRow(record)
        # Sanity check: the id derived from the populated object must
        # match the row's stored id.
        assert variantSet.getId() == record.id
        # Insert the variantSet into the memory-based object model.
        parentDataset.addVariantSet(variantSet)
def getDataModelInstance(self, localId, dataPath):
    """
    Builds a variant set from dataPath under a throwaway dataset and
    stub reference set. Returns the first variant annotation set when
    the data is annotated, otherwise the variant set itself.
    """
    variantSet = variants.HtslibVariantSet(datasets.Dataset("ds"), localId)
    variantSet.populateFromDirectory(dataPath)
    variantSet.setReferenceSet(references.AbstractReferenceSet("rs"))
    if not variantSet.isAnnotated():
        return variantSet
    sequenceOntology = ontologies.Ontology(paths.ontologyName)
    sequenceOntology.populateFromFile(paths.ontologyPath)
    annotationSet = variantSet.getVariantAnnotationSets()[0]
    annotationSet.setOntology(sequenceOntology)
    return annotationSet
def _createVariantAnnotationSet(self, vcfDir):
    """
    Creates a VariantAnnotationSet from the specified directory of
    VCF files.
    """
    self._variantSetName = "testVariantSet"
    # Open the shared test repository read-only; the ontology is looked
    # up from it below.
    repo = datarepo.SqlDataRepository(paths.testDataRepo)
    repo.open(datarepo.MODE_READ)
    self._repo = repo
    self._dataset = datasets.Dataset("testDs")
    variantSet = variants.HtslibVariantSet(
        self._dataset, self._variantSetName)
    variantSet.populateFromDirectory(vcfDir)
    self._variantSet = variantSet
    annotationSet = variants.HtslibVariantAnnotationSet(
        self._variantSet, "testVAs")
    annotationSet.setOntology(
        self._repo.getOntologyByName(paths.ontologyName))
    self._variantAnnotationSet = annotationSet
def createRepo(self):
    """
    Creates the repository for all the data we've just downloaded.

    Builds a writable SqlDataRepository at self.repoPath containing the
    subset reference set, a demonstration dataset, one variant set over
    the downloaded VCFs, and one read group set per sample, then commits,
    reopens read-only and prints a summary.
    """
    repo = datarepo.SqlDataRepository(self.repoPath)
    repo.open("w")
    repo.initialise()
    referenceSet = references.HtslibReferenceSet("GRCh37-subset")
    referenceSet.populateFromFile(self.fastaFilePath)
    referenceSet.setDescription("Subset of GRCh37 used for demonstration")
    # Fix: the species term was garbled ("H**o sapiens"); NCBI taxon 9606
    # is "Homo sapiens".
    speciesJson = (
        '{"id": "9606",' +
        '"term": "Homo sapiens", "source_name": "NCBI"}')
    referenceSet.setSpeciesFromJson(speciesJson)
    for reference in referenceSet.getReferences():
        reference.setSpeciesFromJson(speciesJson)
        reference.setSourceAccessions(
            self.accessions[reference.getName()] + ".subset")
    repo.insertReferenceSet(referenceSet)
    dataset = datasets.Dataset("1kg-p3-subset")
    dataset.setDescription("Sample data from 1000 Genomes phase 3")
    repo.insertDataset(dataset)
    variantSet = variants.HtslibVariantSet(dataset, "mvncall")
    variantSet.setReferenceSet(referenceSet)
    dataUrls = [vcfFile for vcfFile, _ in self.vcfFilePaths]
    indexFiles = [indexFile for _, indexFile in self.vcfFilePaths]
    variantSet.populateFromFile(dataUrls, indexFiles)
    variantSet.checkConsistency()
    repo.insertVariantSet(variantSet)
    for sample, (bamFile, indexFile) in zip(
            self.samples, self.bamFilePaths):
        readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
        readGroupSet.populateFromFile(bamFile, indexFile)
        readGroupSet.setReferenceSet(referenceSet)
        repo.insertReadGroupSet(readGroupSet)
    repo.commit()
    repo.close()
    self.log("Finished creating the repository; summary:\n")
    repo.open("r")
    repo.printSummary()
def testInstantiation(self):
    """
    Populating a variant set from each local dataset directory must
    raise OverlappingVcfException.
    """
    for identifier in self.localIds:
        vcfPath = self.getFullPath(identifier)
        overlappingSet = variants.HtslibVariantSet(self.dataset, identifier)
        with self.assertRaises(exceptions.OverlappingVcfException):
            overlappingSet.populateFromDirectory(vcfPath)
def addVariantSet(self):
    """
    Adds a new VariantSet into this repo.

    Resolves the VCF data files and their indexes from the CLI
    arguments, builds an HtslibVariantSet, associates it with a
    ReferenceSet (explicit or inferred from the VCF header), and
    inserts it — together with any annotation sets — as one atomic
    repo update.

    Raises RepoManagerException when the name or reference set cannot
    be inferred, and MissingIndexException when a default-indexed data
    file does not exist locally.
    """
    self._openRepo()
    dataset = self._repo.getDatasetByName(self._args.datasetName)
    dataUrls = self._args.dataFiles
    name = self._args.name
    if len(dataUrls) == 1:
        # A single path: the name can default to the file/dir name.
        if self._args.name is None:
            name = getNameFromPath(dataUrls[0])
        if os.path.isdir(dataUrls[0]):
            # Read in the VCF files from the directory.
            # TODO support uncompressed VCF and BCF files
            vcfDir = dataUrls[0]
            pattern = os.path.join(vcfDir, "*.vcf.gz")
            dataUrls = glob.glob(pattern)
            if len(dataUrls) == 0:
                raise exceptions.RepoManagerException(
                    "Cannot find any VCF files in the directory "
                    "'{}'.".format(vcfDir))
            dataUrls[0] = self._getFilePath(
                dataUrls[0], self._args.relativePath)
    elif self._args.name is None:
        raise exceptions.RepoManagerException(
            "Cannot infer the intended name of the VariantSet when "
            "more than one VCF file is provided. Please provide a "
            "name argument using --name.")
    # Only local paths are rewritten; http/ftp URLs are used verbatim.
    parsed = urlparse.urlparse(dataUrls[0])
    if parsed.scheme not in ['http', 'ftp']:
        # NOTE(review): py2-style map() — returns a list on Python 2.
        dataUrls = map(lambda url: self._getFilePath(
            url, self._args.relativePath), dataUrls)
    # Now, get the index files for the data files that we've now obtained.
    indexFiles = self._args.indexFiles
    if indexFiles is None:
        # First check if all the paths exist locally, as they must
        # if we are making a default index path.
        for dataUrl in dataUrls:
            if not os.path.exists(dataUrl):
                raise exceptions.MissingIndexException(
                    "Cannot find file '{}'. All variant files must be "
                    "stored locally if the default index location is "
                    "used. If you are trying to create a VariantSet "
                    "based on remote URLs, please download the index "
                    "files to the local file system and provide them "
                    "with the --indexFiles argument".format(dataUrl))
        # We assume that the indexes are made by adding .tbi
        indexSuffix = ".tbi"
        # TODO support BCF input properly here by adding .csi
        indexFiles = [filename + indexSuffix for filename in dataUrls]
    indexFiles = map(lambda url: self._getFilePath(
        url, self._args.relativePath), indexFiles)
    variantSet = variants.HtslibVariantSet(dataset, name)
    variantSet.populateFromFile(dataUrls, indexFiles)
    # Get the reference set that is associated with the variant set.
    referenceSetName = self._args.referenceSetName
    if referenceSetName is None:
        # Try to find a reference set name from the VCF header.
        referenceSetName = variantSet.getVcfHeaderReferenceSetName()
    if referenceSetName is None:
        raise exceptions.RepoManagerException(
            "Cannot infer the ReferenceSet from the VCF header. Please "
            "specify the ReferenceSet to associate with this "
            "VariantSet using the --referenceSetName option")
    referenceSet = self._repo.getReferenceSetByName(referenceSetName)
    variantSet.setReferenceSet(referenceSet)
    variantSet.setAttributes(json.loads(self._args.attributes))
    # Now check for annotations
    annotationSets = []
    if variantSet.isAnnotated() and self._args.addAnnotationSets:
        ontologyName = self._args.ontologyName
        if ontologyName is None:
            raise exceptions.RepoManagerException(
                "A sequence ontology name must be provided")
        ontology = self._repo.getOntologyByName(ontologyName)
        self._checkSequenceOntology(ontology)
        for annotationSet in variantSet.getVariantAnnotationSets():
            annotationSet.setOntology(ontology)
            annotationSets.append(annotationSet)

    # Add the annotation sets and the variant set as an atomic update
    def updateRepo():
        self._repo.insertVariantSet(variantSet)
        for annotationSet in annotationSets:
            self._repo.insertVariantAnnotationSet(annotationSet)
    self._updateRepo(updateRepo)
def getDataModelInstance(self, localId, dataPath):
    """
    Builds and returns an HtslibVariantSet populated from dataPath,
    attached to a stub reference set named "test".
    """
    instance = variants.HtslibVariantSet(self._dataset, localId)
    instance.populateFromDirectory(dataPath)
    instance.setReferenceSet(references.AbstractReferenceSet("test"))
    return instance
def main():
    """
    Builds the Simons Genome Diversity Project demo repository
    (repo2.db): inserts biosamples and individuals parsed from the SGDP
    TSV metadata, the NCBI37 reference set, the sequence ontology, and
    one plain plus one annotated variant set per VCF file found under
    /mnt/ga4gh/data/vcf/.
    """
    # Set for using hg38 rather than hg19
    # reference_set_path = '/mnt/ga4gh/repo_data/hg38.fa.gz'
    reference_set_path = '/mnt/ga4gh/repo_data/hs37d5.fa.gz'
    bio_tsv_location = (
        'SGDP_metadata.279public.21signedLetter.samples.Biosample.tsv')
    ind_tsv_location = (
        'SGDP_metadata.279public.21signedLetter.samples.individual.tsv')
    bio_samples = parse_file_biosamples(bio_tsv_location)
    individuals = parse_file_individuals(ind_tsv_location)
    repoPath = os.path.join("repo2.db")
    repo = datarepo.SqlDataRepository(repoPath)
    # Fix: start from a clean database using os.remove instead of
    # shelling out to "rm" (portable, no shell dependency), and drop the
    # redundant "== True" comparison.
    if os.path.isfile(repoPath):
        os.remove(repoPath)
    repo.open("w")
    repo.initialise()
    dataset = datasets.Dataset("Simons")
    dataset.setDescription(
        "Variants from the Simons Foundation Genome Diversity Project")
    repo.insertDataset(dataset)
    print("Inserting biosamples")
    new_bio_samples = []
    for bio_sample in bio_samples:
        # Names may contain non-UTF8 bytes; replace rather than fail
        # (Python 2 unicode()).
        new_bio_sample = biodata.Biosample(
            dataset, unicode(bio_sample['name'], errors='replace'))
        new_bio_sample.populateFromJson(json.dumps(bio_sample))
        repo.insertBiosample(new_bio_sample)
        new_bio_samples.append(new_bio_sample)
    print("Inserting individuals")
    new_individuals = []
    for individual in individuals:
        new_individual = biodata.Individual(
            dataset, unicode(individual['name'], errors='replace'))
        new_individual.populateFromJson(json.dumps(individual))
        repo.insertIndividual(new_individual)
        new_individuals.append(new_individual)
    print("Adding reference set (takes a while)")
    reference_set = references.HtslibReferenceSet("NCBI37")
    reference_set.populateFromFile(reference_set_path)
    reference_set.setDescription("NCBI37 assembly of the human genome")
    reference_set.setNcbiTaxonId(9606)
    reference_set.setSourceUri(
        "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/"
        "phase2_reference_assembly_sequence/hs37d5.fa.gz")
    for reference in reference_set.getReferences():
        reference.setNcbiTaxonId(9606)
    repo.insertReferenceSet(reference_set)
    seq_ontology = ontologies.Ontology("/mnt/ga4gh/repo_data/so-xp")
    ontology_file_path = '/mnt/ga4gh/repo_data/so-xp-simple.obo'
    seq_ontology.populateFromFile(ontology_file_path)
    seq_ontology._id = "so-xp"
    repo.insertOntology(seq_ontology)
    repo.addOntology(seq_ontology)
    vcf_directory = os.path.dirname('/mnt/ga4gh/data/vcf/')
    pattern = os.path.join(vcf_directory, "*.vcf.gz")
    for vcfFile in glob.glob(pattern):
        # Derive the variant set name from the file name.
        name = vcfFile.replace("/mnt/ga4gh/data/vcf/", "")
        name = name.replace(".annotated.nh2.variants.vcf.gz", "")
        print(name)
        variant_set = variants.HtslibVariantSet(dataset, name)
        variant_set.setReferenceSet(reference_set)
        variant_set.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set.checkConsistency()
        # Link call sets to biosamples sharing the same local id.
        # NOTE(review): this calls setBioSampleId while another helper in
        # the project uses setBiosampleId — confirm the correct casing
        # against the CallSet API.
        for call_set in variant_set.getCallSets():
            for bio_sample in new_bio_samples:
                if bio_sample.getLocalId() == call_set.getLocalId():
                    call_set.setBioSampleId(bio_sample.getId())
        repo.insertVariantSet(variant_set)
        # Second pass: register the same VCF as an annotated variant set.
        name = name + "-annotated-nh2"
        print(name)
        variant_set2 = variants.HtslibVariantSet(dataset, name)
        variant_set2.setReferenceSet(reference_set)
        variant_set2.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set2.checkConsistency()
        repo.insertVariantSet(variant_set2)
        for annotation_set in variant_set2.getVariantAnnotationSets():
            print(str(annotation_set) + "found")
            annotation_set.setOntology(seq_ontology)
            repo.insertVariantAnnotationSet(annotation_set)
    repo.commit()
    print("database filled!")