def _readBiosampleTable(self):
    """
    Rebuilds a Biosample object for every row returned by
    m.Biosample.select() and attaches it to the dataset the row
    refers to.
    """
    for row in m.Biosample.select():
        parentDataset = self.getDataset(row.datasetid.id)
        restored = biodata.Biosample(parentDataset, row.name)
        restored.populateFromRow(row)
        # The rebuilt object must report the same ID as the stored row.
        assert restored.getId() == row.id
        parentDataset.addBiosample(restored)
def addBiosample(self):
    """
    Creates a biosample from the values held in self._args (dataset
    name, biosample name and biosample JSON) and inserts it into the
    repo.
    """
    self._openRepo()
    args = self._args
    parentDataset = self._repo.getDatasetByName(args.datasetName)
    newBiosample = bio_metadata.Biosample(parentDataset, args.biosampleName)
    newBiosample.populateFromJson(args.biosample)
    self._updateRepo(self._repo.insertBiosample, newBiosample)
def testToProtocolElement(self):
    """
    Checks that a Biosample populated from valid JSON exposes the same
    created/updated values through toProtocolElement, and that
    populating a Biosample from malformed JSON raises
    InvalidJsonException.
    """
    dataset = datasets.Dataset('dataset1')
    # Write out a valid input
    validBiosample = protocol.Biosample(
        name="test",
        created="2016-05-19T21:00:19Z",
        updated="2016-05-19T21:00:19Z")
    validBiosample.attributes.attr['test'].values.add().string_value = \
        'test-info'
    # pass through protocol creation
    biosample = bioMetadata.Biosample(dataset, "test")
    biosample.populateFromJson(protocol.toJson(validBiosample))
    gaBiosample = biosample.toProtocolElement()
    # Verify elements exist
    self.assertEqual(gaBiosample.created, validBiosample.created)
    self.assertEqual(gaBiosample.updated, validBiosample.updated)
    # Invalid input
    invalidBiosample = '{"bad:", "json"}'
    # This must be a Biosample (not an Individual) so that the failure
    # path being checked is Biosample.populateFromJson.
    biosample = bioMetadata.Biosample(dataset, "test")
    # Should fail
    self.assertRaises(
        exceptions.InvalidJsonException,
        biosample.populateFromJson,
        invalidBiosample)
def run(self):
    """
    Builds the BRCA1 test data repository.

    Reads the source files from self.inputDirectory, writes derived
    files (compressed FASTA, BAM files and their indexes, ontology
    copy, GFF3 database, cgd copy, RNA database) to
    self.outputDirectory, and registers each resulting object with
    self.repo in this order: reference set, dataset, individuals and
    biosamples, read group sets, ontology, variant sets, feature sets,
    phenotype association set, RNA quantification set.
    """
    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)
    self.repo.open("w")
    self.repo.initialise()

    # Reference set: copy the FASTA and compress it next to the copy.
    referenceFileName = "ref_brca1.fa"
    inputRef = os.path.join(self.inputDirectory, referenceFileName)
    outputRef = os.path.join(self.outputDirectory, referenceFileName)
    shutil.copy(inputRef, outputRef)
    fastaFilePath = os.path.join(
        self.outputDirectory, referenceFileName + '.gz')
    pysam.tabix_compress(outputRef, fastaFilePath)

    with open(os.path.join(
            self.inputDirectory, "ref_brca1.json")) as refMetadataFile:
        refMetadata = json.load(refMetadataFile)
    with open(os.path.join(
            self.inputDirectory,
            "referenceset_hg37.json")) as refMetadataFile:
        refSetMetadata = json.load(refMetadataFile)

    referenceSet = references.HtslibReferenceSet(
        refSetMetadata['assemblyId'])
    referenceSet.populateFromFile(os.path.abspath(fastaFilePath))
    referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
    referenceSet.setDescription(refSetMetadata['description'])
    if refSetMetadata['species']:
        speciesJson = json.dumps(refSetMetadata['species'])
        referenceSet.setSpeciesFromJson(speciesJson)  # needs a string
    referenceSet.setIsDerived(refSetMetadata['isDerived'])
    referenceSet.setSourceUri(refSetMetadata['sourceUri'])
    referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])

    # The per-reference species comes from the reference's own metadata,
    # so that is the value the guard has to test. It is serialised once
    # because it is the same for every reference.
    referenceSpecies = refMetadata.get('species')
    referenceSpeciesJson = (
        json.dumps(referenceSpecies) if referenceSpecies else None)
    for reference in referenceSet.getReferences():
        if referenceSpeciesJson is not None:
            reference.setSpeciesFromJson(referenceSpeciesJson)
        reference.setSourceAccessions(refMetadata['sourceAccessions'])
    self.repo.insertReferenceSet(referenceSet)

    dataset = datasets.Dataset("brca1")
    # Some info is set, it isn't important what
    dataset.setAttributes({"version": ga4gh.server.__version__})
    self.repo.insertDataset(dataset)

    # Individuals and biosamples, inserted pairwise per sample.
    sampleNames = ["HG00096", "HG00099", "HG00101"]
    biosamples = [
        self._addIndividualAndBiosample(dataset, sampleName)
        for sampleName in sampleNames]
    hg00096Biosample = biosamples[0]

    # Reads: convert each SAM file to an indexed BAM file.
    readFiles = [
        "brca1_HG00096.sam",
        "brca1_HG00099.sam",
        "brca1_HG00101.sam"
    ]
    for readFile in readFiles:
        name = readFile.split('_')[1].split('.')[0]
        destFilePath = os.path.join(self.outputDirectory, name + ".bam")
        readSrc = pysam.AlignmentFile(
            os.path.join(self.inputDirectory, readFile), "r")
        try:
            readDest = pysam.AlignmentFile(
                destFilePath, "wb", header=readSrc.header)
            try:
                for readData in readSrc:
                    readDest.write(readData)
            finally:
                readDest.close()
        finally:
            readSrc.close()
        pysam.index(destFilePath)
        readGroupSet = reads.HtslibReadGroupSet(dataset, name)
        readGroupSet.populateFromFile(
            os.path.abspath(destFilePath),
            os.path.abspath(destFilePath + ".bai"))
        readGroupSet.setReferenceSet(referenceSet)
        dataset.addReadGroupSet(readGroupSet)
        # Link each read group to the biosample whose local ID equals
        # the read group's sample name.
        for readGroup in readGroupSet.getReadGroups():
            for biosample in biosamples:
                if biosample.getLocalId() == readGroup.getSampleName():
                    readGroup.setBiosampleId(biosample.getId())
        self.repo.insertReadGroupSet(readGroupSet)

    # Ontology
    ontologyMapFileName = "so-xp-simple.obo"
    inputOntologyMap = os.path.join(
        self.inputDirectory, ontologyMapFileName)
    outputOntologyMap = os.path.join(
        self.outputDirectory, ontologyMapFileName)
    shutil.copy(inputOntologyMap, outputOntologyMap)
    sequenceOntology = ontologies.Ontology("so-xp-simple")
    sequenceOntology.populateFromFile(os.path.abspath(outputOntologyMap))
    sequenceOntology._id = "so-xp-simple"
    self.repo.insertOntology(sequenceOntology)
    self.repo.addOntology(sequenceOntology)

    # Variants
    vcfFiles = [
        "brca1_1kgPhase3_variants.vcf",
        "brca1_WASH7P_annotation.vcf",
        "brca1_OR4F_annotation.vcf"
    ]
    for vcfFile in vcfFiles:
        self.addVariantSet(
            vcfFile, dataset, referenceSet, sequenceOntology, biosamples)

    # Sequence annotations
    seqAnnFile = "brca1_gencodev19.gff3"
    seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
    seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
    dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
    dbgen.run()
    gencode = sequence_annotations.Gff3DbFeatureSet(dataset, "gencodev19")
    gencode.setOntology(sequenceOntology)
    gencode.populateFromFile(os.path.abspath(seqAnnDest))
    gencode.setReferenceSet(referenceSet)
    self.repo.insertFeatureSet(gencode)

    # add g2p featureSet
    g2pPath = os.path.join(self.inputDirectory, "cgd")
    # copy all files from the input cgd directory to the output path
    outputG2PPath = os.path.join(self.outputDirectory, "cgd")
    # Guarded the same way as the top-level output directory, which is
    # allowed to exist already.
    if not os.path.exists(outputG2PPath):
        os.makedirs(outputG2PPath)
    for filename in glob.glob(os.path.join(g2pPath, '*.*')):
        shutil.copy(filename, outputG2PPath)
    featuresetG2P = g2p_featureset.PhenotypeAssociationFeatureSet(
        dataset, os.path.abspath(outputG2PPath))
    featuresetG2P.setOntology(sequenceOntology)
    featuresetG2P.setReferenceSet(referenceSet)
    featuresetG2P.populateFromFile(os.path.abspath(outputG2PPath))
    self.repo.insertFeatureSet(featuresetG2P)

    # add g2p phenotypeAssociationSet
    phenotypeAssociationSet = \
        g2p_associationset.RdfPhenotypeAssociationSet(
            dataset, "cgd", os.path.abspath(outputG2PPath))
    self.repo.insertPhenotypeAssociationSet(phenotypeAssociationSet)

    dataset.addFeatureSet(gencode)

    # RNA Quantification
    rnaDbName = os.path.join(self.outputDirectory, "rnaseq.db")
    store = rnaseq2ga.RnaSqliteStore(rnaDbName)
    store.createTables()
    rnaseq2ga.rnaseq2ga(
        os.path.join(self.inputDirectory, "rna_brca1.tsv"),
        rnaDbName, "rna_brca1.tsv", "rsem",
        featureType="transcript",
        readGroupSetNames="HG00096",
        dataset=dataset,
        featureSetNames="gencodev19",
        biosampleId=hg00096Biosample.getId())
    rnaQuantificationSet = rna_quantification.SqliteRnaQuantificationSet(
        dataset, "rnaseq")
    rnaQuantificationSet.setReferenceSet(referenceSet)
    rnaQuantificationSet.populateFromFile(os.path.abspath(rnaDbName))
    self.repo.insertRnaQuantificationSet(rnaQuantificationSet)


def _addIndividualAndBiosample(self, dataset, sampleName):
    """
    Creates the individual and the biosample named sampleName from
    individual_<sampleName>.json and biosample_<sampleName>.json in
    self.inputDirectory, links the biosample to the individual, inserts
    both into self.repo (individual first) and returns the biosample.
    """
    individual = biodata.Individual(dataset, sampleName)
    individualPath = os.path.join(
        self.inputDirectory, "individual_{}.json".format(sampleName))
    with open(individualPath) as individualFile:
        individual.populateFromJson(individualFile.read())
    self.repo.insertIndividual(individual)

    biosample = biodata.Biosample(dataset, sampleName)
    biosamplePath = os.path.join(
        self.inputDirectory, "biosample_{}.json".format(sampleName))
    with open(biosamplePath) as biosampleFile:
        biosample.populateFromJson(biosampleFile.read())
    biosample.setIndividualId(individual.getId())
    self.repo.insertBiosample(biosample)
    return biosample
def __init__(
        self, localId, referenceSet, randomSeed=0,
        numVariantSets=1, numCalls=1, variantDensity=0.5,
        numReadGroupSets=1, numReadGroupsPerReadGroupSet=1,
        numAlignments=1, numFeatureSets=1,
        numPhenotypeAssociationSets=1,
        numPhenotypeAssociations=2, numRnaQuantSets=2,
        numExpressionLevels=2):
    """
    Populates this dataset with simulated content: phenotype
    association sets, variant sets (each with a variant annotation
    set and with two biosamples and one individual per call set),
    read group sets (with one biosample and one individual per read
    group), feature sets and RNA quantification sets.

    Each simulated set that takes a seed receives randomSeed plus its
    index within its own loop. numExpressionLevels is accepted but is
    not used by this constructor.
    """
    super(SimulatedDataset, self).__init__(localId)
    self._description = "Simulated dataset {}".format(localId)

    # Phenotype association sets
    for index in range(numPhenotypeAssociationSets):
        associationSetName = "simPas{}".format(index)
        associationSeed = randomSeed + index
        associationSet = g2p.SimulatedPhenotypeAssociationSet(
            self, associationSetName, associationSeed,
            numPhenotypeAssociations)
        self.addPhenotypeAssociationSet(associationSet)

    # TODO create a simulated Ontology
    # Variants
    for index in range(numVariantSets):
        variantSetName = "simVs{}".format(index)
        variantSeed = randomSeed + index
        variantSet = variants.SimulatedVariantSet(
            self, referenceSet, variantSetName, variantSeed,
            numCalls, variantDensity)
        # Add biosamples: two per call set, both tied to one individual
        for callSet in variantSet.getCallSets():
            firstBiosample = biodata.Biosample(self, callSet.getLocalId())
            secondBiosample = biodata.Biosample(
                self, callSet.getLocalId() + "2")
            donor = biodata.Individual(self, callSet.getLocalId())
            firstBiosample.setIndividualId(donor.getId())
            secondBiosample.setIndividualId(donor.getId())
            self.addIndividual(donor)
            self.addBiosample(firstBiosample)
            self.addBiosample(secondBiosample)
        self.addVariantSet(variantSet)
        annotationSet = variants.SimulatedVariantAnnotationSet(
            variantSet, "simVas{}".format(index), variantSeed)
        variantSet.addVariantAnnotationSet(annotationSet)

    # Reads
    for index in range(numReadGroupSets):
        readGroupSetName = 'simRgs{}'.format(index)
        readSeed = randomSeed + index
        readGroupSet = reads.SimulatedReadGroupSet(
            self, readGroupSetName, referenceSet, readSeed,
            numReadGroupsPerReadGroupSet, numAlignments)
        for readGroup in readGroupSet.getReadGroups():
            groupBiosample = biodata.Biosample(self, readGroup.getLocalId())
            groupDonor = biodata.Individual(self, readGroup.getLocalId())
            groupBiosample.setIndividualId(groupDonor.getId())
            readGroup.setBiosampleId(groupBiosample.getId())
            self.addIndividual(groupDonor)
            self.addBiosample(groupBiosample)
        self.addReadGroupSet(readGroupSet)

    # Features
    for index in range(numFeatureSets):
        featureSetName = "simFs{}".format(index)
        featureSeed = randomSeed + index
        featureSet = sequence_annotations.SimulatedFeatureSet(
            self, featureSetName, featureSeed)
        featureSet.setReferenceSet(referenceSet)
        self.addFeatureSet(featureSet)

    # RnaQuantificationSets
    for index in range(numRnaQuantSets):
        rnaQuantSetName = 'simRqs{}'.format(index)
        rnaQuantSet = rnaQuantification.SimulatedRnaQuantificationSet(
            self, rnaQuantSetName)
        rnaQuantSet.setReferenceSet(referenceSet)
        self.addRnaQuantificationSet(rnaQuantSet)
def main():
    """
    Builds the "repo2.db" data repository for the Simons dataset.

    Any existing "repo2.db" in the working directory is deleted first.
    The repository is then filled, from hard-coded input paths, with
    the dataset, its biosamples and individuals, the NCBI37 reference
    set, the sequence ontology, and two variant sets per VCF file found
    under /mnt/ga4gh/data/vcf/ (the second one carrying the variant
    annotation sets). Changes are committed at the end.
    """
    # Set for using hg38 rather than hg19
    # reference_set_path = '/mnt/ga4gh/repo_data/hg38.fa.gz'
    reference_set_path = '/mnt/ga4gh/repo_data/hs37d5.fa.gz'

    bio_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.Biosample.tsv'
    ind_tsv_location = 'SGDP_metadata.279public.21signedLetter.samples.individual.tsv'
    bio_samples = parse_file_biosamples(bio_tsv_location)
    individuals = parse_file_individuals(ind_tsv_location)

    repo_path = "repo2.db"
    repo = datarepo.SqlDataRepository(repo_path)
    # Start from a clean file. os.remove avoids spawning a shell and
    # raises if the file cannot be deleted instead of failing silently.
    if os.path.isfile(repo_path):
        os.remove(repo_path)
    repo.open("w")
    repo.initialise()

    dataset = datasets.Dataset("Simons")
    dataset.setDescription(
        "Variants from the Simons Foundation Genome Diversity Project")
    repo.insertDataset(dataset)

    print("Inserting biosamples")
    new_bio_samples = []
    for bio_sample in bio_samples:
        new_bio_sample = biodata.Biosample(
            dataset, unicode(bio_sample['name'], errors='replace'))
        new_bio_sample.populateFromJson(json.dumps(bio_sample))
        repo.insertBiosample(new_bio_sample)
        new_bio_samples.append(new_bio_sample)

    print("Inserting individuals")
    new_individuals = []
    for individual in individuals:
        new_individual = biodata.Individual(
            dataset, unicode(individual['name'], errors='replace'))
        new_individual.populateFromJson(json.dumps(individual))
        repo.insertIndividual(new_individual)
        new_individuals.append(new_individual)

    print("Adding reference set (takes a while)")
    reference_set = references.HtslibReferenceSet("NCBI37")
    reference_set.populateFromFile(reference_set_path)
    reference_set.setDescription("NCBI37 assembly of the human genome")
    reference_set.setNcbiTaxonId(9606)
    reference_set.setSourceUri(
        "ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/technical/reference/phase2_reference_assembly_sequence/hs37d5.fa.gz"
    )
    for reference in reference_set.getReferences():
        reference.setNcbiTaxonId(9606)
    repo.insertReferenceSet(reference_set)

    seq_ontology = ontologies.Ontology("/mnt/ga4gh/repo_data/so-xp")
    ontology_file_path = '/mnt/ga4gh/repo_data/so-xp-simple.obo'
    seq_ontology.populateFromFile(ontology_file_path)
    seq_ontology._id = "so-xp"
    repo.insertOntology(seq_ontology)
    repo.addOntology(seq_ontology)

    # Index the biosamples by local ID once so every call set is matched
    # with a single lookup. If two biosamples share a local ID the later
    # one wins, which is also what a full scan would leave in place.
    bio_samples_by_local_id = {
        bio_sample.getLocalId(): bio_sample
        for bio_sample in new_bio_samples}

    vcf_directory = os.path.dirname('/mnt/ga4gh/data/vcf/')
    pattern = os.path.join(vcf_directory, "*.vcf.gz")
    for vcfFile in glob.glob(pattern):
        name = vcfFile.replace("/mnt/ga4gh/data/vcf/", "")
        name = name.replace(".annotated.nh2.variants.vcf.gz", "")
        print(name)
        variant_set = variants.HtslibVariantSet(dataset, name)
        variant_set.setReferenceSet(reference_set)
        variant_set.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set.checkConsistency()
        for call_set in variant_set.getCallSets():
            matching_bio_sample = bio_samples_by_local_id.get(
                call_set.getLocalId())
            if matching_bio_sample is not None:
                # NOTE(review): other code in this project spells the
                # equivalent read-group setter "setBiosampleId"; confirm
                # that "setBioSampleId" exists on the call set class.
                call_set.setBioSampleId(matching_bio_sample.getId())
        repo.insertVariantSet(variant_set)

        # Second variant set over the same file; this is the one whose
        # variant annotation sets are inserted.
        name = name + "-annotated-nh2"
        print(name)
        variant_set2 = variants.HtslibVariantSet(dataset, name)
        variant_set2.setReferenceSet(reference_set)
        variant_set2.populateFromFile([vcfFile], [vcfFile + ".tbi"])
        variant_set2.checkConsistency()
        repo.insertVariantSet(variant_set2)
        for annotation_set in variant_set2.getVariantAnnotationSets():
            print(str(annotation_set) + "found")
            annotation_set.setOntology(seq_ontology)
            repo.insertVariantAnnotationSet(annotation_set)

    repo.commit()
    print("database filled!")