Пример #1
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Create a Gff3DbFeatureSet named localId in this object's dataset,
     attach the ontology and reference set held by this object, and
     populate it from the file at dataPath.

     Returns the populated feature set.
     """
     instance = sequenceAnnotations.Gff3DbFeatureSet(self._dataset, localId)
     # The ontology and reference set are attached before populating,
     # matching the order the setup has always used.
     instance.setOntology(self._ontology)
     instance.setReferenceSet(self._referenceSet)
     instance.populateFromFile(dataPath)
     return instance
Пример #2
0
 def _readFeatureSetTable(self, cursor):
     """
     Recreate every feature set stored in the FeatureSet table.

     For each row a Gff3DbFeatureSet is created in the dataset named by
     the row's datasetId, linked to the reference set and ontology named
     by referenceSetId and ontologyId, populated from the row, and then
     added to that dataset.
     """
     # sqlite3.Row allows the columns to be addressed by name below.
     cursor.row_factory = sqlite3.Row
     cursor.execute("SELECT * FROM FeatureSet;")
     for record in cursor:
         owner = self.getDataset(record[b'datasetId'])
         restored = sequenceAnnotations.Gff3DbFeatureSet(
             owner, record[b'name'])
         referenceSet = self.getReferenceSet(record[b'referenceSetId'])
         restored.setReferenceSet(referenceSet)
         ontology = self.getOntology(record[b'ontologyId'])
         restored.setOntology(ontology)
         restored.populateFromRow(record)
         # Sanity check: the ID of the rebuilt object must equal the
         # ID stored in the row.
         assert restored.getId() == record[b'id']
         owner.addFeatureSet(restored)
Пример #3
0
    def __init__(self, localId, dataDir, dataRepository):
        """
        Build the dataset by scanning the sub-directories of dataDir.

        Three directories under dataDir are listed, in this order: the
        variants directory (one variant set per sub-directory, plus a
        variant annotation set when the variant set reports that it is
        annotated), the reads directory (one read group set per '*.bam'
        file) and the features directory (one feature set per '*.db'
        file). Every object found is added to this dataset.
        """
        super(FileSystemDataset, self).__init__(localId)
        self._dataDir = dataDir
        self._setMetadata()

        # Variants
        variantsRoot = os.path.join(dataDir, self.variantsDirName)
        for entryName in os.listdir(variantsRoot):
            entryPath = os.path.join(variantsRoot, entryName)
            if not os.path.isdir(entryPath):
                continue
            variantSet = variants.HtslibVariantSet(
                self, entryName, entryPath, dataRepository)
            self.addVariantSet(variantSet)
            # Variant annotation sets
            if variantSet.isAnnotated(entryPath):
                annotationSet = variants.HtslibVariantAnnotationSet(
                    self, entryName, entryPath, dataRepository, variantSet)
                self.addVariantAnnotationSet(annotationSet)

        # Reads
        readsRoot = os.path.join(dataDir, self.readsDirName)
        for bamName in os.listdir(readsRoot):
            if not fnmatch.fnmatch(bamName, '*.bam'):
                continue
            # The file name without its extension is the local ID.
            readGroupSetId = os.path.splitext(bamName)[0]
            bamPath = os.path.join(readsRoot, bamName)
            self.addReadGroupSet(
                reads.HtslibReadGroupSet(
                    self, readGroupSetId, bamPath, dataRepository))

        # Sequence Annotations
        featuresRoot = os.path.join(dataDir, self.featuresDirName)
        for dbName in os.listdir(featuresRoot):
            if not fnmatch.fnmatch(dbName, '*.db'):
                continue
            featureSetId = os.path.splitext(dbName)[0]
            dbPath = os.path.join(featuresRoot, dbName)
            self.addFeatureSet(
                sequenceAnnotations.Gff3DbFeatureSet(
                    self, featureSetId, dbPath, dataRepository))
    def run(self):
        """
        Convert the compliance input data into a populated repository.

        Reads its inputs from self.inputDirectory, writes derived files
        to self.outputDirectory (created if missing) and registers the
        resulting objects with self.repo. The steps run in this order:

        1. Open and initialise the repository.
        2. Copy and compress the reference FASTA, then insert the
           reference set with the metadata from the two JSON files.
        3. Insert the "brca1" dataset.
        4. Insert one Individual and one BioSample per sample name.
        5. Convert each SAM file to an indexed BAM and insert it as a
           read group set, linking read groups to matching bio samples.
        6. Copy the ontology file and insert/add the ontology.
        7. Add each VCF file through self.addVariantSet.
        8. Generate the GFF3 database and insert the feature set.
        9. Commit the repository and report completion on stderr.
        """
        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)
        self.repo.open("w")
        self.repo.initialise()

        # --- Reference set -------------------------------------------
        referenceFileName = "ref_brca1.fa"
        inputRef = os.path.join(self.inputDirectory, referenceFileName)
        outputRef = os.path.join(self.outputDirectory, referenceFileName)
        shutil.copy(inputRef, outputRef)
        fastaFilePath = os.path.join(
            self.outputDirectory, referenceFileName + '.gz')
        pysam.tabix_compress(outputRef, fastaFilePath)

        refMetadataPath = os.path.join(
            self.inputDirectory, "ref_brca1.json")
        with open(refMetadataPath) as refMetadataFile:
            refMetadata = json.load(refMetadataFile)
        refSetMetadataPath = os.path.join(
            self.inputDirectory, "referenceset_hg37.json")
        with open(refSetMetadataPath) as refSetMetadataFile:
            refSetMetadata = json.load(refSetMetadataFile)

        referenceSet = references.HtslibReferenceSet(
            refSetMetadata['assemblyId'])

        referenceSet.populateFromFile(fastaFilePath)
        referenceSet.setAssemblyId(refSetMetadata['assemblyId'])
        referenceSet.setDescription(refSetMetadata['description'])
        referenceSet.setNcbiTaxonId(refSetMetadata['ncbiTaxonId'])
        referenceSet.setIsDerived(refSetMetadata['isDerived'])
        referenceSet.setSourceUri(refSetMetadata['sourceUri'])
        referenceSet.setSourceAccessions(refSetMetadata['sourceAccessions'])
        # Per-reference metadata comes from ref_brca1.json and is applied
        # to every reference in the set.
        for reference in referenceSet.getReferences():
            reference.setNcbiTaxonId(refMetadata['ncbiTaxonId'])
            reference.setSourceAccessions(
                refMetadata['sourceAccessions'])
        self.repo.insertReferenceSet(referenceSet)

        # --- Dataset -------------------------------------------------
        dataset = datasets.Dataset("brca1")
        self.repo.insertDataset(dataset)

        # --- Individuals and bio samples -----------------------------
        # Each sample has an individual_<name>.json and a
        # bioSample_<name>.json in the input directory. The bio samples
        # are collected once here because both the read group sets and
        # the variant sets below need them.
        sampleNames = ["HG00096", "HG00099", "HG00101"]
        bioSamples = []
        for sampleName in sampleNames:
            individual = biodata.Individual(dataset, sampleName)
            individualPath = os.path.join(
                self.inputDirectory,
                "individual_{}.json".format(sampleName))
            with open(individualPath) as jsonFile:
                individual.populateFromJson(jsonFile.read())
            self.repo.insertIndividual(individual)

            bioSample = biodata.BioSample(dataset, sampleName)
            bioSamplePath = os.path.join(
                self.inputDirectory,
                "bioSample_{}.json".format(sampleName))
            with open(bioSamplePath) as jsonFile:
                bioSample.populateFromJson(jsonFile.read())
            bioSample.setIndividualId(individual.getId())
            self.repo.insertBioSample(bioSample)
            bioSamples.append(bioSample)

        # --- Reads ---------------------------------------------------
        readFiles = [
            "brca1_HG00096.sam",
            "brca1_HG00099.sam",
            "brca1_HG00101.sam"]

        for readFile in readFiles:
            # "brca1_HG00096.sam" -> "HG00096"
            name = readFile.split('_')[1].split('.')[0]
            readSrc = pysam.AlignmentFile(
                os.path.join(self.inputDirectory, readFile), "r")
            try:
                readDest = pysam.AlignmentFile(
                    os.path.join(self.outputDirectory, name + ".bam"),
                    "wb", header=readSrc.header)
                try:
                    destFilePath = readDest.filename
                    for readData in readSrc:
                        readDest.write(readData)
                finally:
                    # Close even when copying fails so the handle is
                    # not leaked.
                    readDest.close()
            finally:
                readSrc.close()
            pysam.index(destFilePath)
            readGroupSet = reads.HtslibReadGroupSet(dataset, name)
            readGroupSet.populateFromFile(destFilePath, destFilePath + ".bai")
            readGroupSet.setReferenceSet(referenceSet)
            # Link each read group to the bio sample whose local ID
            # equals the read group's sample name.
            for readGroup in readGroupSet.getReadGroups():
                for bioSample in bioSamples:
                    if bioSample.getLocalId() == readGroup.getSampleName():
                        readGroup.setBioSampleId(bioSample.getId())
            self.repo.insertReadGroupSet(readGroupSet)

        # --- Ontology ------------------------------------------------
        ontologyMapFileName = "so-xp-simple.obo"
        inputOntologyMap = os.path.join(
            self.inputDirectory, ontologyMapFileName)
        outputOntologyMap = os.path.join(
            self.outputDirectory, ontologyMapFileName)
        shutil.copy(inputOntologyMap, outputOntologyMap)

        sequenceOntology = ontologies.Ontology("so-xp-simple")
        sequenceOntology.populateFromFile(outputOntologyMap)
        # NOTE(review): this writes the private _id attribute directly,
        # presumably to pin the ontology ID to its name - confirm that
        # no public setter exists for this.
        sequenceOntology._id = "so-xp-simple"
        self.repo.insertOntology(sequenceOntology)
        self.repo.addOntology(sequenceOntology)

        # --- Variants ------------------------------------------------
        vcfFiles = [
            "brca1_1kgPhase3_variants.vcf",
            "brca1_WASH7P_annotation.vcf",
            "brca1_OR4F_annotation.vcf"]
        for vcfFile in vcfFiles:
            self.addVariantSet(
                vcfFile,
                dataset,
                referenceSet,
                sequenceOntology,
                bioSamples)

        # --- Sequence annotations ------------------------------------
        seqAnnFile = "brca1_gencodev19.gff3"
        seqAnnSrc = os.path.join(self.inputDirectory, seqAnnFile)
        seqAnnDest = os.path.join(self.outputDirectory, "gencodev19.db")
        dbgen = generate_gff3_db.Gff32Db(seqAnnSrc, seqAnnDest)
        dbgen.run()
        gencode = sequenceAnnotations.Gff3DbFeatureSet(dataset, "gencodev19")
        gencode.setOntology(sequenceOntology)
        gencode.populateFromFile(seqAnnDest)
        gencode.setReferenceSet(referenceSet)

        self.repo.insertFeatureSet(gencode)

        self.repo.commit()

        print("Done converting compliance data.", file=sys.stderr)
Пример #5
0
 def getDataModelInstance(self, localId, dataPath):
     """
     Return a Gff3DbFeatureSet for localId, constructed from dataPath
     together with this object's dataset and data repository.
     """
     featureSet = sequenceAnnotations.Gff3DbFeatureSet(
         self._dataset, localId, dataPath, self._datarepo)
     return featureSet