예제 #1
0
파일: backend.py 프로젝트: pcingola/server
    def __init__(self, dataDir):
        """
        Builds the backend by scanning the directory tree under dataDir.

        Each sub-directory of ``<dataDir>/references`` is wrapped in an
        HtslibReferenceSet and registered with addReferenceSet. Every other
        sub-directory directly under dataDir is wrapped in a
        FileSystemDataset and registered with addDataset.
        """
        super(FileSystemBackend, self).__init__()
        self._dataDir = dataDir
        # TODO this directory walking is ugly and is only a temporary
        # stop-gap until the data tree is iterated over properly.

        # References
        referencesDirName = "references"
        referencesRoot = os.path.join(self._dataDir, referencesDirName)
        for entry in os.listdir(referencesRoot):
            entryPath = os.path.join(referencesRoot, entry)
            if not os.path.isdir(entryPath):
                # Plain files inside the references directory are ignored.
                continue
            self.addReferenceSet(
                references.HtslibReferenceSet(entry, entryPath, self))

        # Datasets: collect every candidate directory before constructing
        # any dataset object.
        datasetDirs = []
        for entry in os.listdir(self._dataDir):
            entryPath = os.path.join(self._dataDir, entry)
            if os.path.isdir(entryPath) and entry != referencesDirName:
                datasetDirs.append(entryPath)
        for datasetDir in datasetDirs:
            self.addDataset(datasets.FileSystemDataset(datasetDir, self))
예제 #2
0
    def __init__(self, dataDir):
        """
        Builds the backend's id maps by scanning the tree under dataDir.

        Sub-directories of ``<dataDir>/references`` become reference sets
        keyed by directory name, and their references are indexed by id.
        Every other sub-directory directly under dataDir becomes a dataset
        keyed by its own id. Sorted id lists are stored next to each map.
        """
        super(FileSystemBackend, self).__init__()
        self._dataDir = dataDir
        # TODO this directory walking is ugly and is only a temporary
        # stop-gap until the data tree is iterated over properly.

        # References
        referencesDirName = "references"
        referencesRoot = os.path.join(self._dataDir, referencesDirName)
        for entry in os.listdir(referencesRoot):
            entryPath = os.path.join(referencesRoot, entry)
            if not os.path.isdir(entryPath):
                continue
            refSet = references.HtslibReferenceSet(entry, entryPath)
            self._referenceSetIdMap[entry] = refSet
            for ref in refSet.getReferences():
                self._referenceIdMap[ref.getId()] = ref
        self._referenceSetIds = sorted(self._referenceSetIdMap)
        self._referenceIds = sorted(self._referenceIdMap)

        # Datasets: collect every candidate directory before constructing
        # any dataset object.
        datasetDirs = []
        for entry in os.listdir(self._dataDir):
            entryPath = os.path.join(self._dataDir, entry)
            if os.path.isdir(entryPath) and entry != referencesDirName:
                datasetDirs.append(entryPath)
        for datasetDir in datasetDirs:
            dataset = datasets.FileSystemDataset(datasetDir)
            self._datasetIdMap[dataset.getId()] = dataset
        self._datasetIds = sorted(self._datasetIdMap)
예제 #3
0
 def _readReferenceSetTable(self, cursor):
     """
     Loads every row of the ReferenceSet table into the in-memory model.

     The cursor's row factory is switched to sqlite3.Row so that columns
     can be looked up by name.
     """
     cursor.row_factory = sqlite3.Row
     cursor.execute("SELECT * FROM ReferenceSet;")
     for record in cursor:
         refSet = references.HtslibReferenceSet(record[b'name'])
         refSet.populateFromRow(record)
         # Sanity check: the id of the rebuilt object must match the
         # id stored in the row.
         assert refSet.getId() == record[b"id"]
         # Register the reference set with the memory-based object model.
         self.addReferenceSet(refSet)
예제 #4
0
    def createRepo(self):
        """
        Builds the SQL data repository from the previously downloaded files.

        Inserts one reference set, one dataset, one variant set and one read
        group set per sample, commits, and then reopens the repository
        read-only to print a summary.
        """
        repo = datarepo.SqlDataRepository(self.repoPath)
        repo.open("w")
        repo.initialise()

        # Reference set and per-reference metadata.
        taxonId = 9606
        refSet = references.HtslibReferenceSet("GRCh37-subset")
        refSet.populateFromFile(self.fastaFilePath)
        refSet.setDescription("Subset of GRCh37 used for demonstration")
        refSet.setNcbiTaxonId(taxonId)
        for ref in refSet.getReferences():
            ref.setNcbiTaxonId(taxonId)
            accession = self.accessions[ref.getName()] + ".subset"
            ref.setSourceAccessions(accession)
        repo.insertReferenceSet(refSet)

        # Dataset that owns the variant and read data.
        dataset = datasets.Dataset("1kg-p3-subset")
        dataset.setDescription("Sample data from 1000 Genomes phase 3")
        repo.insertDataset(dataset)

        # Variant set: self.vcfFilePaths holds (vcfFile, indexFile) pairs.
        variantSet = variants.HtslibVariantSet(dataset, "mvncall")
        variantSet.setReferenceSet(refSet)
        vcfUrls = [pair[0] for pair in self.vcfFilePaths]
        vcfIndexes = [pair[1] for pair in self.vcfFilePaths]
        variantSet.populateFromFile(vcfUrls, vcfIndexes)
        variantSet.checkConsistency()
        repo.insertVariantSet(variantSet)

        # One read group set per sample, paired with its BAM and index.
        for sample, bamPaths in zip(self.samples, self.bamFilePaths):
            bamFile, bamIndex = bamPaths
            readGroupSet = reads.HtslibReadGroupSet(dataset, sample)
            readGroupSet.populateFromFile(bamFile, bamIndex)
            readGroupSet.setReferenceSet(refSet)
            repo.insertReadGroupSet(readGroupSet)

        repo.commit()
        repo.close()
        self.log("Finished creating the repository; summary:\n")
        repo.open("r")
        repo.printSummary()
예제 #5
0
 def addReferenceSet(self):
     """
     Adds a new reference set to this repo from the parsed arguments.

     When no name is given, one is derived from the file path argument.
     """
     self._openRepo()
     args = self._args
     name = args.name
     filePath = self._getFilePath(args.filePath, args.relativePath)
     if name is None:
         name = getNameFromPath(args.filePath)

     refSet = references.HtslibReferenceSet(name)
     refSet.populateFromFile(filePath)
     refSet.setDescription(args.description)
     refSet.setNcbiTaxonId(args.ncbiTaxonId)
     refSet.setIsDerived(args.isDerived)
     refSet.setAssemblyId(args.assemblyId)
     # Accessions arrive as one comma-separated string; absent means none.
     if args.sourceAccessions is None:
         accessions = []
     else:
         accessions = args.sourceAccessions.split(",")
     refSet.setSourceAccessions(accessions)
     refSet.setSourceUri(args.sourceUri)
     self._updateRepo(self._repo.insertReferenceSet, refSet)
예제 #6
0
 def testMissingReferenceSetMetadata(self):
     """
     Constructing a reference set from the "invalid_refset_meta"
     directory must raise ValueError.
     """
     refSetName = "invalid_refset_meta"
     refSetPath = self.getFullPath(refSetName)
     self.assertRaises(
         ValueError,
         references.HtslibReferenceSet, refSetName, refSetPath, None)
예제 #7
0
 def testMissingReferenceSetMetadata(self):
     """
     Constructing a reference set from the "missing_refset_meta"
     directory must raise MissingReferenceSetMetadata.
     """
     refSetName = "missing_refset_meta"
     refSetPath = self.getFullPath(refSetName)
     self.assertRaises(
         exceptions.MissingReferenceSetMetadata,
         references.HtslibReferenceSet, refSetName, refSetPath, None)
예제 #8
0
 def testNoReferenceSetMetadata(self):
     """
     Constructing a reference set from the "no_refset_meta" directory
     must raise IOError.
     """
     refSetName = "no_refset_meta"
     refSetPath = self.getFullPath(refSetName)
     self.assertRaises(
         IOError,
         references.HtslibReferenceSet, refSetName, refSetPath, None)
    def run(self):
        """
        Converts the compliance input files into a populated repository.

        Reads from self.inputDirectory, writes derived files into
        self.outputDirectory (created if missing) and inserts, in order: the
        reference set, the dataset, the individuals and bio samples, the
        read group sets, the sequence ontology, the variant sets and the
        gencode feature set. The repository is committed at the end.
        """
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        # Reference set: copy the FASTA next to the repo and compress it
        # with pysam.tabix_compress; the compressed copy is what gets loaded.
        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(self.inputDirectory, referenceFileName)
        outputRef = os.path.join(self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(
            self.outputDirectory, referenceFileName + '.gz')
        pysam.tabix_compress(outputRef, fastaFilePath)

        refMetadataPath = os.path.join(
            self.inputDirectory, "ref_brca1.json")
        with open(refMetadataPath) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        refSetMetadataPath = os.path.join(
            self.inputDirectory, "referenceset_hg37.json")
        with open(refSetMetadataPath) as refSetMetadataFile:
            refSetMetadata = json.load(refSetMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(fastaFilePath)
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
            reference.setSourceAccessions(
                refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("brca1")
        self.repo.insertDataset(dataset)

        # Individuals and bio samples: one pair per sample, each populated
        # from its JSON file. The bio samples are collected once here
        # because both the read groups and the variant sets need them.
        bioSamples = []
        for sampleName in ("HG00096", "HG00099", "HG00101"):
            individual = biodata.Individual(dataset, sampleName)
            individualPath = os.path.join(
                self.inputDirectory, "individual_" + sampleName + ".json")
            with open(individualPath) as jsonFile:
                individual.populateFromJson(jsonFile.read())
            self.repo.insertIndividual(individual)
            bioSample = biodata.BioSample(dataset, sampleName)
            bioSamplePath = os.path.join(
                self.inputDirectory, "bioSample_" + sampleName + ".json")
            with open(bioSamplePath) as jsonFile:
                bioSample.populateFromJson(jsonFile.read())
            bioSample.setIndividualId(individual.getId())
            self.repo.insertBioSample(bioSample)
            bioSamples.append(bioSample)

        # Read group sets: convert each SAM file to an indexed BAM file.
        readFiles = [
            "brca1_HG00096.sam",
            "brca1_HG00099.sam",
            "brca1_HG00101.sam"]

        for readFile in readFiles:
            # "brca1_HG00096.sam" -> "HG00096"
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(
                os.path.join(self.outputDirectory, name + ".bam"),
                "wb", header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai")
            readGroupSet.setReferenceSet(referenceSet)
            # Link each read group to the bio sample with the same name.
            for readGroup in readGroupSet.getReadGroups():
                for bioSample in bioSamples:
                    if bioSample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBioSampleId(bioSample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        # Sequence ontology
        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(
            self.inputDirectory, ontologyMapFileName)
        outputOntologyMap = os.path.join(
            self.outputDirectory, ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(outputOntologyMap)
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        # Variant sets
        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf",
            "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"]
        for vcfFile in vcfFiles:
            self.addVariantSet(
                vcfFile,
                dataset,
                referenceSet,
                sequenceOntology,
                bioSamples)

        # Sequence annotations: build a database from the GFF3 file and
        # load it as a feature set.
        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequenceAnnotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(seqAnnDest)
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        self.repo.commit()

        print("Done converting compliance data.", file=sys.stderr)
예제 #10
0
 def testInstantiation(self):
     """
     Every reference set id in self.setIds must fail to instantiate
     with NotExactlyOneReferenceException.
     """
     for setId in self.setIds:
         setPath = self.getFullPath(setId)
         self.assertRaises(
             exceptions.NotExactlyOneReferenceException,
             references.HtslibReferenceSet, setId, setPath)
예제 #11
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Returns an HtslibReferenceSet for localId built from dataPath,
     passing None as the constructor's third argument.
     """
     referenceSet = references.HtslibReferenceSet(localId, dataPath, None)
     return referenceSet
예제 #12
0
    def run(self):
        """
        Converts the compliance input files into a populated repository.

        Reads from self.inputDirectory, writes derived files into
        self.outputDirectory (created if missing) and inserts, in order: the
        reference set, the dataset, the individuals and bio samples, the
        read group sets, the sequence ontology, the variant sets, the
        gencode feature set, the G2P feature set and phenotype association
        set, and the RNA quantification set. The repository is committed
        after the G2P data and again at the end.
        """
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        # Reference set: copy the FASTA next to the repo and compress it
        # with pysam.tabix_compress; the compressed copy is what gets loaded.
        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(self.inputDirectory, referenceFileName)
        outputRef = os.path.join(self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(self.outputDirectory,
                                     referenceFileName + '.gz')
        pysam.tabix_compress(outputRef, fastaFilePath)

        refMetadataPath = os.path.join(
            self.inputDirectory, "ref_brca1.json")
        with open(refMetadataPath) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        refSetMetadataPath = os.path.join(
            self.inputDirectory, "referenceset_hg37.json")
        with open(refSetMetadataPath) as refSetMetadataFile:
            refSetMetadata = json.load(refSetMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(fastaFilePath)
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
            reference.setSourceAccessions(refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        dataset = datasets.Dataset("brca1")
        # Some info is set, it isn't important what
        dataset.setInfo({"version": ga4gh.__version__})
        self.repo.insertDataset(dataset)

        # Individuals and bio samples: one pair per sample, each populated
        # from its JSON file. The bio samples are collected once here
        # because both the read groups and the variant sets need them.
        bioSamples = []
        for sampleName in ("HG00096", "HG00099", "HG00101"):
            individual = biodata.Individual(dataset, sampleName)
            individualPath = os.path.join(
                self.inputDirectory, "individual_" + sampleName + ".json")
            with open(individualPath) as jsonFile:
                individual.populateFromJson(jsonFile.read())
            self.repo.insertIndividual(individual)
            bioSample = biodata.BioSample(dataset, sampleName)
            bioSamplePath = os.path.join(
                self.inputDirectory, "bioSample_" + sampleName + ".json")
            with open(bioSamplePath) as jsonFile:
                bioSample.populateFromJson(jsonFile.read())
            bioSample.setIndividualId(individual.getId())
            self.repo.insertBioSample(bioSample)
            bioSamples.append(bioSample)

        # Read group sets: convert each SAM file to an indexed BAM file.
        readFiles = [
            "brca1_HG00096.sam", "brca1_HG00099.sam", "brca1_HG00101.sam"
        ]

        for readFile in readFiles:
            # "brca1_HG00096.sam" -> "HG00096"
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            readDest = pysam.AlignmentFile(
                os.path.join(self.outputDirectory, name + ".bam"),
                "wb",
                header=readSrc.header)
            destFilePath = readDest.filename
            for readData in readSrc:
                readDest.write(readData)
            readDest.close()
            readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai")
            readGroupSet.setReferenceSet(referenceSet)
            # NOTE(review): presumably registered on the in-memory dataset so
            # the RNA step below can resolve it by name - confirm.
            dataset.addReadGroupSet(readGroupSet)
            # Link each read group to the bio sample with the same name.
            for readGroup in readGroupSet.getReadGroups():
                for bioSample in bioSamples:
                    if bioSample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBioSampleId(bioSample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        # Sequence ontology
        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(self.inputDirectory,
                                        ontologyMapFileName)
        outputOntologyMap = os.path.join(self.outputDirectory,
                                         ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(outputOntologyMap)
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        # Variant sets
        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf", "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"
        ]
        for vcfFile in vcfFiles:
            self.addVariantSet(vcfFile, dataset, referenceSet,
                               sequenceOntology, bioSamples)

        # Sequence annotations: build a database from the GFF3 file and
        # load it as a feature set.
        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequence_annotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(seqAnnDest)
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        # G2P feature set: copy every "*.*" file from the input "cgd"
        # directory into a fresh "cgd" directory under the output path.
        g2pPath = os.path.join(self.inputDirectory, "cgd")
        outputG2PPath = os.path.join(self.outputDirectory, "cgd")
        os.makedirs(outputG2PPath)
        for filename in glob.glob(os.path.join(g2pPath, '*.*')):
            shutil.copy(filename, outputG2PPath)

        featuresetG2P = g2p_featureset.PhenotypeAssociationFeatureSet(
            dataset, outputG2PPath)
        featuresetG2P.setOntology(sequenceOntology)
        featuresetG2P.setReferenceSet(referenceSet)
        featuresetG2P.populateFromFile(outputG2PPath)
        self.repo.insertFeatureSet(featuresetG2P)

        # G2P phenotype association set
        phenotypeAssociationSet = g2p_associationset\
            .RdfPhenotypeAssociationSet(dataset, "cgd", outputG2PPath)
        self.repo.insertPhenotypeAssociationSet(phenotypeAssociationSet)

        self.repo.commit()
        # NOTE(review): presumably registered on the in-memory dataset so
        # the RNA step below can resolve "gencodev19" by name - confirm.
        dataset.addFeatureSet(gencode)

        # RNA quantification: create the sqlite store, load the TSV into
        # it, then register the resulting database as a quantification set.
        rnaDbName = os.path.join(self.outputDirectory, "rnaseq.db")
        store = rnaseq2ga.RnaSqliteStore(rnaDbName)
        store.createTables()
        rnaseq2ga.rnaseq2ga(self.inputDirectory + "/rna_brca1.tsv",
                            rnaDbName,
                            "rna_brca1.tsv",
                            "rsem",
                            featureType="transcript",
                            readGroupSetNames="HG00096",
                            featureSetNames="gencodev19",
                            dataset=dataset)
        rnaQuantificationSet = rna_quantification.SqliteRnaQuantificationSet(
            dataset, "rnaseq")
        rnaQuantificationSet.setReferenceSet(referenceSet)
        rnaQuantificationSet.populateFromFile(rnaDbName)
        self.repo.insertRnaQuantificationSet(rnaQuantificationSet)

        self.repo.commit()
예제 #13
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Returns an HtslibReferenceSet named localId that has been populated
     from the file at dataPath.
     """
     instance = references.HtslibReferenceSet(localId)
     instance.populateFromFile(dataPath)
     return instance