Exemplo n.º 1
0
Arquivo: rdd.py Projeto: laserson/adam
    def saveAsVcf(self,
                  filePath,
                  asSingleFile=True,
                  deferMerging=False,
                  stringency=LENIENT,
                  disableFastConcat=False):
        """
        Writes the variants held by this RDD to disk in VCF format.

        :param str filePath: Destination path for the VCF output.
        :param bool asSingleFile: If true, the sharded output is merged into a
        single file after saving.
        :param bool deferMerging: If true, the output is prepared for merging
        into a single file, but the merge itself is not performed.
        :param bdgenomics.adam.stringency stringency: The stringency to apply
        while writing the VCF.
        :param bool disableFastConcat: If asSingleFile is true, turns off the
        fast concatenation engine used when saving to HDFS.
        """

        # The JVM method expects the stringency last, after disableFastConcat,
        # which differs from the order of the Python parameters.
        jvmStringency = _toJava(stringency, self.sc._jvm)

        self._jvmRdd.saveAsVcf(filePath, asSingleFile, deferMerging,
                               disableFastConcat, jvmStringency)
Exemplo n.º 2
0
    def loadIndexedBam(self,
                       filePath,
                       viewRegions,
                       stringency=STRICT):
        """
        Loads alignment records from indexed BAM, restricted to a set of
        regions.

        Behaves like loadAlignments, but relies on BAM index files to read
        fewer blocks, and only returns records that fall within the given
        ReferenceRegions. A BAM index file is required.

        :param str filePath: The path to load indexed BAM formatted alignment
        records from. Globs/directories are supported.
        :param list<ReferenceRegion> viewRegions: The ReferenceRegions to
        filter on.
        :param stringency: The validation stringency to use when validating
        the BAM/CRAM/SAM format header. Defaults to STRICT.
        :return: Returns an AlignmentRecordRDD which wraps the RDD of alignment
        records, the sequence dictionary representing contigs the alignment
        records may be aligned to, and the record group dictionary for the
        alignment records if one is available.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        jvm = self._jvm

        # each Python ReferenceRegion must be converted to its JVM counterpart
        # before it can cross the gateway
        jvmRegions = [region._toJava(jvm) for region in viewRegions]
        jvmStringency = _toJava(stringency, jvm)

        jvmAlignments = self.__jac.loadIndexedBam(filePath,
                                                  jvmRegions,
                                                  jvmStringency)

        return AlignmentRecordRDD(jvmAlignments, self._sc)
Exemplo n.º 3
0
    def loadAlignments(self, filePath, stringency=STRICT):
        """
        Reads alignment records from a path and wraps them in an
        AlignmentRecordRDD.

        The format is picked from the path name extension:
        * .bam/.cram/.sam is read as BAM/CRAM/SAM format,
        * .fa/.fasta is read as FASTA format,
        * .fq/.fastq is read as FASTQ format, and
        * .ifq is read as interleaved FASTQ format.

        Any other path falls back to Parquet + Avro.

        For FASTA, FASTQ, and interleaved FASTQ formats, compressed files are
        supported through compression codecs configured in Hadoop, which by
        default include .gz and .bz2, but can include more.

        :param str filePath: The path to load the file from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns an RDD containing reads.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        jvmStringency = _toJava(stringency, self._jvm)
        jvmAlignments = self.__jac.loadAlignments(filePath, jvmStringency)

        return AlignmentRecordRDD(jvmAlignments, self._sc)
Exemplo n.º 4
0
    def loadFeatures(self, filePath, stringency=STRICT):
        """
        Reads features from a path and wraps them in a FeatureDataset.

        The format is picked from the path name extension:
        * .bed is read as BED6/12 format,
        * .gff3 is read as GFF3 format,
        * .gtf/.gff is read as GTF/GFF2 format,
        * .narrow[pP]eak is read as NarrowPeak format, and
        * .interval_list is read as IntervalList format.

        Any other path falls back to Parquet + Avro.

        For BED6/12, GFF3, GTF/GFF2, NarrowPeak, and IntervalList formats,
        compressed files are supported through compression codecs configured
        in Hadoop, which by default include .gz and .bz2, but can include
        more.

        :param str filePath: The path to load the file from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns a genomic dataset containing features.
        :rtype: bdgenomics.adam.rdd.FeatureDataset
        """

        jvmStringency = _toJava(stringency, self._jvm)
        jvmFeatures = self.__jac.loadFeatures(filePath, jvmStringency)

        return FeatureDataset(jvmFeatures, self._sc)
Exemplo n.º 5
0
    def loadAlignments(self, filePath, stringency=STRICT):
        """
        Loads alignment records from disk as an AlignmentRecordRDD.

        Path names are dispatched on their extension:
        * .bam/.cram/.sam loads as BAM/CRAM/SAM format,
        * .fa/.fasta loads as FASTA format,
        * .fq/.fastq loads as FASTQ format, and
        * .ifq loads as interleaved FASTQ format.

        When no extension matches, loading falls back to Parquet + Avro.

        For FASTA, FASTQ, and interleaved FASTQ formats, compressed files are
        supported through compression codecs configured in Hadoop, which by
        default include .gz and .bz2, but can include more.

        :param str filePath: The path to load the file from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns an RDD containing reads.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        # convert the Python-side stringency before handing it to the JVM
        javaStringency = _toJava(stringency, self._jvm)

        return AlignmentRecordRDD(
            self.__jac.loadAlignments(filePath, javaStringency),
            self._sc)
Exemplo n.º 6
0
Arquivo: rdd.py Projeto: laserson/adam
    def saveAsPairedFastq(self,
                          fileName1,
                          fileName2,
                          persistLevel,
                          outputOriginalBaseQualities=False,
                          validationStringency=LENIENT):
        """
        Writes these AlignmentRecords out as a pair of FASTQ files.

        One file receives the first mate of each pair, the other receives the
        second mate.

        :param str fileName1: Path at which to save a FASTQ file containing the
        first mate of each pair.
        :param str fileName2: Path at which to save a FASTQ file containing the
        second mate of each pair.
        :param pyspark.storagelevel.StorageLevel persistLevel: The persistence
        level to cache reads at between passes.
        :param bool outputOriginalBaseQualities: If true, writes out reads with
        the base qualities from the original qualities (SAM "OQ") field. If
        false, writes out reads with the base qualities from the qual field.
        Default is false.
        :param bdgenomics.adam.stringency validationStringency: If strict, throw
        an exception if any read in this RDD is not accompanied by its mate.
        """

        jvmStringency = _toJava(validationStringency, self.sc._jvm)

        # persistLevel is handed to the JVM call unchanged, and goes last there
        # even though it is the third Python parameter
        self._jvmRdd.saveAsPairedFastq(fileName1,
                                       fileName2,
                                       outputOriginalBaseQualities,
                                       jvmStringency,
                                       persistLevel)
Exemplo n.º 7
0
    def saveAsPairedFastq(self,
                          fileName1,
                          fileName2,
                          persistLevel,
                          outputOriginalBaseQualities=False,
                          validationStringency=LENIENT):
        """
        Saves these AlignmentRecords as two FASTQ files, split by mate.

        The first file holds the first mate in each pair, and the second file
        holds the second mate in each pair.

        :param str fileName1: Path at which to save a FASTQ file containing the
        first mate of each pair.
        :param str fileName2: Path at which to save a FASTQ file containing the
        second mate of each pair.
        :param pyspark.storagelevel.StorageLevel persistLevel: The persistence
        level to cache reads at between passes.
        :param bool outputOriginalBaseQualities: If true, writes out reads with
        the base qualities from the original qualities (SAM "OQ") field. If
        false, writes out reads with the base qualities from the qual field.
        Default is false.
        :param bdgenomics.adam.stringency validationStringency: If strict, throw
        an exception if any read in this RDD is not accompanied by its mate.
        """

        # JVM argument order: both paths, the quality flag, the converted
        # stringency, and finally the persist level as received.
        javaStringency = _toJava(validationStringency, self.sc._jvm)
        jvmArgs = (fileName1,
                   fileName2,
                   outputOriginalBaseQualities,
                   javaStringency,
                   persistLevel)

        self._jvmRdd.saveAsPairedFastq(*jvmArgs)
Exemplo n.º 8
0
    def loadCoverage(self, filePath, stringency=STRICT):
        """
        Loads coverage data from a feature file as a CoverageRDD.

        Features are loaded and converted to coverage; the coverage value is
        stored in the score field of Feature.

        The format is picked from the path name extension:
        * .bed is read as BED6/12 format,
        * .gff3 is read as GFF3 format,
        * .gtf/.gff is read as GTF/GFF2 format,
        * .narrow[pP]eak is read as NarrowPeak format, and
        * .interval_list is read as IntervalList format.

        Any other path falls back to Parquet + Avro.

        For BED6/12, GFF3, GTF/GFF2, NarrowPeak, and IntervalList formats,
        compressed files are supported through compression codecs configured
        in Hadoop, which by default include .gz and .bz2, but can include
        more.

        :param str filePath: The path to load coverage data from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns an RDD containing coverage.
        :rtype: bdgenomics.adam.rdd.CoverageRDD
        """

        jvmStringency = _toJava(stringency, self._jvm)
        jvmCoverage = self.__jac.loadCoverage(filePath, jvmStringency)

        return CoverageRDD(jvmCoverage, self._sc)
Exemplo n.º 9
0
    def loadIndexedBam(self, filePath, viewRegions, stringency=STRICT):
        """
        Loads alignments from indexed BAM, restricted to a set of regions.

        Works like loadAlignments, but uses BAM index files to look at fewer
        blocks, and only returns records within the specified
        ReferenceRegions. A BAM index file is required.

        :param str filePath: The path to load indexed BAM formatted alignments
        from. Globs/directories are supported.
        :param list<ReferenceRegion> viewRegions: The ReferenceRegions to
        filter on.
        :param stringency: The validation stringency to use when validating
        the BAM/CRAM/SAM format header. Defaults to STRICT.
        :return: Returns an AlignmentDataset which wraps the RDD of alignment
        records, the sequence dictionary representing contigs the alignments
        may be aligned to, and the read group dictionary for the alignment
        records if one is available.
        :rtype: bdgenomics.adam.ds.AlignmentDataset
        """

        jvm = self._jvm

        # the regions have to be translated into JVM types before the call
        jvmRegions = [region._toJava(jvm) for region in viewRegions]
        jvmStringency = _toJava(stringency, jvm)

        return AlignmentDataset(
            self.__jac.loadIndexedBam(filePath, jvmRegions, jvmStringency),
            self._sc)
Exemplo n.º 10
0
Arquivo: rdd.py Projeto: aim11/adam
    def recalibrateBaseQualities(self,
                                 knownSnps,
                                 validationStringency=LENIENT):
        """
        Runs base quality score recalibration on a set of reads. Uses a table of
        known SNPs to mask true variation during the recalibration process.

        :param bdgenomics.adam.rdd.VariantRDD knownSnps: A table of known SNPs
        to mask valid variants.
        :param bdgenomics.adam.stringency validationStringency: The stringency
        forwarded to the JVM recalibration call. Defaults to LENIENT.
        :return: Returns the reads produced by the JVM recalibration call,
        wrapped for use from Python.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        jvmStringency = _toJava(validationStringency, self.sc._jvm)
        recalibrated = self._jvmRdd.recalibrateBaseQualities(knownSnps._jvmRdd,
                                                             jvmStringency)

        # The wrapper is built from the JVM RDD together with the Spark
        # context, as at the other AlignmentRecordRDD construction sites;
        # the context was previously left out here.
        return AlignmentRecordRDD(recalibrated, self.sc)
Exemplo n.º 11
0
Arquivo: rdd.py Projeto: laserson/adam
    def recalibrateBaseQualities(self,
                                 knownSnps,
                                 validationStringency=LENIENT):
        """
        Recalibrates the base quality scores of these reads.

        A table of known SNPs is used to mask true variation during the
        recalibration process.

        :param bdgenomics.adam.rdd.VariantRDD knownSnps: A table of known SNPs
        to mask valid variants.
        :param bdgenomics.adam.stringency validationStringency: The stringency
        forwarded to the JVM recalibration call. Defaults to LENIENT.
        :return: Returns the reads produced by the JVM recalibration call,
        wrapped for use from Python.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        javaStringency = _toJava(validationStringency, self.sc._jvm)

        # Pass the Spark context along with the JVM RDD, matching the other
        # AlignmentRecordRDD construction sites; it was previously omitted.
        return AlignmentRecordRDD(
            self._jvmRdd.recalibrateBaseQualities(knownSnps._jvmRdd,
                                                  javaStringency),
            self.sc)
Exemplo n.º 12
0
    def loadGenotypes(self, filePath, stringency=STRICT):
        """
        Reads genotypes from a path and wraps them in a GenotypeRDD.

        Path names with a .vcf/.vcf.gz/.vcf.bgz extension are loaded as VCF
        format. Anything else falls back to Parquet + Avro.

        :param str filePath: The path to load the file from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns an RDD containing genotypes.
        :rtype: bdgenomics.adam.rdd.GenotypeRDD
        """

        jvmStringency = _toJava(stringency, self._jvm)
        jvmGenotypes = self.__jac.loadGenotypes(filePath, jvmStringency)

        return GenotypeRDD(jvmGenotypes, self._sc)
Exemplo n.º 13
0
    def loadVariants(self, filePath, stringency=STRICT):
        """
        Reads variants from a path and wraps them in a VariantDataset.

        Path names with a .vcf/.vcf.gz/.vcf.bgz extension are loaded as VCF
        format. Anything else falls back to Parquet + Avro.

        :param str filePath: The path to load the file from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns a genomic dataset containing variants.
        :rtype: bdgenomics.adam.rdd.VariantDataset
        """

        jvmStringency = _toJava(stringency, self._jvm)
        jvmVariants = self.__jac.loadVariants(filePath, jvmStringency)

        return VariantDataset(jvmVariants, self._sc)
Exemplo n.º 14
0
    def loadVariants(self, filePath, stringency=STRICT):
        """
        Loads variants from disk as a VariantRDD.

        A path name carrying a .vcf/.vcf.gz/.vcf.bgz extension is loaded as
        VCF format; every other path falls back to Parquet + Avro.

        :param str filePath: The path to load the file from.
        :param stringency: The validation stringency to apply. Defaults to
        STRICT.
        :return: Returns an RDD containing variants.
        :rtype: bdgenomics.adam.rdd.VariantRDD
        """

        # convert the Python-side stringency before handing it to the JVM
        javaStringency = _toJava(stringency, self._jvm)

        return VariantRDD(self.__jac.loadVariants(filePath, javaStringency),
                          self._sc)
Exemplo n.º 15
0
    def reassembleReadPairs(self, secondPairRdd, validationStringency=LENIENT):
        """
        Reassembles read pairs from two sets of unpaired reads.

        The assumption is that the two sets were _originally_ paired together.
        The RDD that this is called on should be the RDD with the first read
        from the pair.

        :param pyspark.rdd.RDD secondPairRdd: The rdd containing the second read
        from the pairs.
        :param bdgenomics.adam.stringency validationStringency: How stringently
        to validate the reads.
        :return: Returns an RDD with the pair information recomputed.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        jvmStringency = _toJava(validationStringency, self.sc._jvm)

        # Hand over the JVM handle of the caller-supplied RDD. The body
        # previously read an undefined name `rdd`, so every call raised
        # NameError.
        reassembled = self._jvmRdd.reassembleReadPairs(secondPairRdd._jrdd,
                                                       jvmStringency)

        return AlignmentRecordRDD(reassembled, self.sc)
Exemplo n.º 16
0
Arquivo: rdd.py Projeto: laserson/adam
    def reassembleReadPairs(self,
                            secondPairRdd,
                            validationStringency=LENIENT):
        """
        Rebuilds read pairs out of two sets of unpaired reads.

        The assumption is that the two sets were _originally_ paired together.
        The RDD that this is called on should be the RDD with the first read
        from the pair.

        :param pyspark.rdd.RDD secondPairRdd: The rdd containing the second read
        from the pairs.
        :param bdgenomics.adam.stringency validationStringency: How stringently
        to validate the reads.
        :return: Returns an RDD with the pair information recomputed.
        :rtype: bdgenomics.adam.rdd.AlignmentRecordRDD
        """

        javaStringency = _toJava(validationStringency, self.sc._jvm)

        # Use the `secondPairRdd` argument: the body used to reference an
        # undefined name `rdd`, which made the method fail with NameError.
        return AlignmentRecordRDD(
            self._jvmRdd.reassembleReadPairs(secondPairRdd._jrdd,
                                             javaStringency),
            self.sc)
Exemplo n.º 17
0
    def saveAsVcf(self,
                  filePath,
                  asSingleFile=True,
                  deferMerging=False,
                  stringency=LENIENT,
                  sortOnSave=None,
                  disableFastConcat=False):
        """
        Writes the variants held by this RDD to disk in VCF format,
        optionally sorting them first.

        :param str filePath: Destination path for the VCF output.
        :param bool asSingleFile: If true, the sharded output is merged into a
        single file after saving.
        :param bool deferMerging: If true, the output is prepared for merging
        into a single file, but the merge itself is not performed.
        :param bdgenomics.adam.stringency stringency: The stringency to apply
        while writing the VCF.
        :param sortOnSave: Whether to sort when saving. If None, does not
        sort. If True, sorts by contig index. If "lexicographically", sorts by
        contig name. Any other falsy value (including False) is rejected.
        :param bool disableFastConcat: If asSingleFile is true, turns off the
        fast concatenation engine used when saving to HDFS.
        :raises RuntimeError: If sortOnSave is falsy but not None.
        """

        # the conversion happens before sortOnSave is inspected, so it also
        # runs when the value ends up being rejected below
        variantContexts = self._jvmRdd.toVariantContextRDD()

        if sortOnSave is not None:
            if sortOnSave == "lexicographically":
                variantContexts = variantContexts.sortLexicographically()
            elif sortOnSave:
                variantContexts = variantContexts.sort()
            else:
                raise RuntimeError(
                    'sortOnSave = %s. Expected None, "lexicographically", or True.'
                    % sortOnSave)

        jvmStringency = _toJava(stringency, self.sc._jvm)
        variantContexts.saveAsVcf(filePath,
                                  asSingleFile,
                                  deferMerging,
                                  disableFastConcat,
                                  jvmStringency)
Exemplo n.º 18
0
    def saveAsFastq(self,
                    fileName,
                    validationStringency=LENIENT,
                    sort=False,
                    outputOriginalBaseQualities=False):
        """
        Writes these reads out in FASTQ format.

        :param str fileName: Path to save files at.
        :param bdgenomics.adam.stringency validationStringency: If strict, throw
        an exception if any read in this RDD is not accompanied by its mate.
        :param bool sort: Whether to sort the FASTQ files by read name or not.
        Defaults to false. Sorting the output will recover pair order, if
        desired.
        :param bool outputOriginalBaseQualities: If true, writes out reads with
        the base qualities from the original qualities (SAM "OQ") field. If
        false, writes out reads with the base qualities from the qual field.
        Default is false.
        """

        # The JVM call takes its arguments in the reverse order of the Python
        # keyword parameters: quality flag, sort flag, then stringency.
        jvmStringency = _toJava(validationStringency, self.sc._jvm)

        self._jvmRdd.saveAsFastq(fileName,
                                 outputOriginalBaseQualities,
                                 sort,
                                 jvmStringency)
Exemplo n.º 19
0
    def saveAsVcf(self,
                  filePath,
                  asSingleFile=True,
                  deferMerging=False,
                  stringency=LENIENT,
                  disableFastConcat=False):
        """
        Saves the variants in this RDD to disk as VCF.

        :param str filePath: Path to write the VCF output to.
        :param bool asSingleFile: If true, the sharded output is merged into a
        single file once it has been saved.
        :param bool deferMerging: If true, the output is saved prepped for
        merging into a single file, but no merge takes place.
        :param bdgenomics.adam.stringency stringency: The stringency to use
        when writing the VCF.
        :param bool disableFastConcat: If asSingleFile is true, disables the
        fast concatenation engine for saving to HDFS.
        """

        javaStringency = _toJava(stringency, self.sc._jvm)

        # JVM argument order puts disableFastConcat ahead of the stringency.
        jvmArgs = (filePath,
                   asSingleFile,
                   deferMerging,
                   disableFastConcat,
                   javaStringency)

        self._jvmRdd.saveAsVcf(*jvmArgs)
Exemplo n.º 20
0
Arquivo: rdd.py Projeto: laserson/adam
    def saveAsFastq(self,
                    fileName,
                    validationStringency=LENIENT,
                    sort=False,
                    outputOriginalBaseQualities=False):
        """
        Saves these reads to disk in FASTQ format.

        :param str fileName: Path to save files at.
        :param bdgenomics.adam.stringency validationStringency: If strict, throw
        an exception if any read in this RDD is not accompanied by its mate.
        :param bool sort: Whether to sort the FASTQ files by read name or not.
        Defaults to false. Sorting the output will recover pair order, if
        desired.
        :param bool outputOriginalBaseQualities: If true, writes out reads with
        the base qualities from the original qualities (SAM "OQ") field. If
        false, writes out reads with the base qualities from the qual field.
        Default is false.
        """

        javaStringency = _toJava(validationStringency, self.sc._jvm)

        # JVM argument order: path, quality flag, sort flag, stringency.
        jvmArgs = (fileName,
                   outputOriginalBaseQualities,
                   sort,
                   javaStringency)

        self._jvmRdd.saveAsFastq(*jvmArgs)