def test_save_as_bam(self):
    """Round-trip a sorted SAM file through a single-file BAM save."""
    ctx = ADAMContext(self.ss)
    source = ctx.loadAlignments(self.resourceFile("sorted.sam"))
    outPath = self.tmpFile() + ".bam"
    source.saveAsSam(outPath, isSorted=True, asSingleFile=True)
    # Reloading the BAM must yield exactly as many records as the input.
    roundTripped = ctx.loadAlignments(outPath)
    self.assertEqual(roundTripped._jvmRdd.jrdd().count(),
                     source._jvmRdd.jrdd().count())
def test_cumulative_count_distribution(self):
    """Both cumulative and plain coverage distributions of small.sam
    contain a single (coverage, count) pair whose count is 1500."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.sam"))
    qc = CoverageDistribution(self.ss, alignments.toCoverage())
    # Exercise cumulative mode first, then the plain distribution.
    for cumulativeFlag in (True, False):
        _, dist = qc.plotDistributions(testMode=True,
                                       cumulative=cumulativeFlag,
                                       normalize=False)
        # First sample only.
        sample = list(dist.popitem()[1])
        assert len(sample) == 1
        assert sample.pop()[1] == 1500
def test_aggregatedCoverage(self):
    """Aggregating coverage of small.sam into 10 bp bins yields 166 records."""
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.aggregatedCoverage(10)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(collapsed.toDF().count(), 166)
def test_collapse(self):
    """Collapsing already-sorted coverage leaves the record count unchanged."""
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.collapse()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(collapsed.toDF().count(), coverage.toDF().count())
def test_collapse(self):
    """Collapsing already-sorted coverage leaves the record count unchanged."""
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.collapse()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(collapsed.toDF().count(), coverage.toDF().count())
def test_aggregatedCoverage(self):
    """Aggregating coverage of small.sam into 10 bp bins yields 166 records."""
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    collapsed = coverage.aggregatedCoverage(10)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(collapsed.toDF().count(), 166)
def test_flatten(self):
    """Flattening small.sam coverage to per-base records yields 1500 rows."""
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    flattened = coverage.flatten()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(flattened.toDF().count(), 1500)
def test_flatten(self):
    """Flattening small.sam coverage to per-base records yields 1500 rows."""
    testFile = self.resourceFile("small.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    flattened = coverage.flatten()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(flattened.toDF().count(), 1500)
def test_load_alignments(self):
    """small.sam loads as 20 records via both the DataFrame and the JVM RDD."""
    ctx = ADAMContext(self.ss)
    loaded = ctx.loadAlignments(self.resourceFile("small.sam"))
    expectedCount = 20
    self.assertEqual(loaded.toDF().count(), expectedCount)
    self.assertEqual(loaded._jvmRdd.jrdd().count(), expectedCount)
def test_count_kmers(self):
    """Counting 6-mers over small.sam produces 1040 entries."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.sam"))
    sixMers = alignments.countKmers(6)
    self.assertEqual(sixMers.count(), 1040)
def test_to_fragments(self):
    """Converting unsorted.sam alignments to fragments yields 5 fragments."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("unsorted.sam"))
    asFragments = alignments.toFragments()
    self.assertEqual(asFragments.toDF().count(), 5)
def test_save_unordered_sam(self):
    """Round-trip unordered.sam through a single-file SAM save."""
    ctx = ADAMContext(self.ss)
    inputPath = self.resourceFile("unordered.sam")
    alignments = ctx.loadAlignments(inputPath)
    outputPath = self.tmpFile() + ".sam"
    alignments.saveAsSam(outputPath, asSingleFile=True)
    # Output must match the input file per the harness comparison helper.
    self.checkFiles(inputPath, outputPath)
def test_save(self):
    """Saving coverage as a single-file ADAM dataset creates a non-empty directory."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("sorted.sam"))
    asCoverage = alignments.toCoverage()
    outputPath = self.tmpFile() + ".coverage.adam"
    asCoverage.save(outputPath, asSingleFile=True, disableFastConcat=True)
    assert os.listdir(outputPath) != []
def test_realignIndels_reads(self):
    """Indel realignment preserves the 20 reads of small.1.sam."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.1.sam"))
    realignedReads = alignments.realignIndels()
    self.assertEqual(realignedReads.toDF().count(), 20)
def test_toFeatures(self):
    """Coverage converts to a FeatureRDD with one feature per coverage record."""
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    features = coverage.toFeatures()
    assert isinstance(features, FeatureRDD)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(features.toDF().count(), coverage.toDF().count())
def test_toFeatures(self):
    """Coverage converts to a FeatureRDD with one feature per coverage record."""
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    features = coverage.toFeatures()
    assert isinstance(features, FeatureRDD)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(features.toDF().count(), coverage.toDF().count())
def test_transform(self):
    """Filtering alignments to referenceName == "1" leaves exactly one record."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("unsorted.sam"))
    chromOneOnly = alignments.transform(
        lambda df: df.filter(df.referenceName == "1"))
    self.assertEqual(chromOneOnly.toDF().count(), 1)
def test_save_sorted_sam(self):
    """Sorting by reference position and re-saving reproduces sorted.sam."""
    ctx = ADAMContext(self.ss)
    inputPath = self.resourceFile("sorted.sam")
    alignments = ctx.loadAlignments(inputPath)
    outputPath = self.tmpFile() + ".sam"
    bySortedPosition = alignments.sortByReferencePosition()
    bySortedPosition.saveAsSam(outputPath, isSorted=True, asSingleFile=True)
    self.checkFiles(inputPath, outputPath)
def test_filterByOverlappingRegion(self):
    """A single chr2:1-400 query overlaps exactly one read in unsorted.sam."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("unsorted.sam"))
    region = ReferenceRegion("chr2", 1, 400)
    overlapping = alignments.filterByOverlappingRegion(region)
    self.assertEqual(overlapping.toDF().count(), 1)
def test_save(self):
    """Save coverage as a single-file ADAM dataset and compare to the input."""
    testFile = self.resourceFile("sorted.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    tmpPath = self.tmpFile() + ".coverage.adam"
    # FIX: the keyword used by every other save call in this suite is
    # asSingleFile, not isSingleFile; the misnamed keyword would raise
    # TypeError at call time.
    coverage.save(tmpPath, asSingleFile=True, disableFastConcat=True)
    self.checkFiles(testFile, tmpPath)
def test_to_coverage(self):
    """Collapsed coverage of unsorted.sam has 42 records; uncollapsed has 46."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)
    coverage = reads.toCoverage()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(coverage.toDF().count(), 42)
    coverage = reads.toCoverage(collapse=False)
    self.assertEqual(coverage.toDF().count(), 46)
def test_filterByOverlappingRegions(self):
    """Two query regions overlap exactly two alignments in unsorted.sam."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)
    # FIX: 1L/400L long literals are Python 2-only syntax (SyntaxError on
    # Python 3); plain ints are arbitrary precision.
    querys = [ReferenceRegion("chr2", 1, 400),
              ReferenceRegion("3", 1, 100)]
    filtered = reads.filterByOverlappingRegions(querys)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(filtered.toDF().count(), 2)
def test_to_coverage(self):
    """Collapsed coverage of unsorted.sam has 42 records; uncollapsed has 46."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)
    coverage = reads.toCoverage()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(coverage.toDF().count(), 42)
    coverage = reads.toCoverage(collapse=False)
    self.assertEqual(coverage.toDF().count(), 46)
def test_save(self):
    """Saving coverage as a single-file ADAM dataset creates a non-empty directory."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("sorted.sam"))
    asCoverage = alignments.toCoverage()
    outputPath = self.tmpFile() + ".coverage.adam"
    asCoverage.save(outputPath, asSingleFile=True, disableFastConcat=True)
    assert os.listdir(outputPath) != []
def test_filterByOverlappingRegions(self):
    """Two chromosome-1 query regions overlap six alignments."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(readsPath)
    # FIX: 20000000L-style long literals are Python 2-only syntax
    # (SyntaxError on Python 3); plain ints are arbitrary precision.
    querys = [ReferenceRegion("1", 20000000, 27000000),
              ReferenceRegion("1", 230000000, 270000000)]
    # FIX: a list of regions goes to the plural filterByOverlappingRegions
    # (the singular variant takes one region, as test_filterByOverlappingRegion
    # shows).
    filtered = reads.filterByOverlappingRegions(querys)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(filtered.toDF().count(), 6)
def test_union(self):
    """The union of sorted.sam and unordered.sam holds 13 alignments."""
    ctx = ADAMContext(self.ss)
    first = ctx.loadAlignments(self.resourceFile("sorted.sam"))
    second = ctx.loadAlignments(self.resourceFile("unordered.sam"))
    combined = first.union([second])
    self.assertEqual(combined.toDF().count(), 13)
def test_alignment_distribution_no_elements(self):
    """A huge bin size yields a single empty bin for small.sam."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.sam"))
    qc = AlignmentDistribution(self.ss, alignments, bin_size=1000000000)
    observed = qc.plot(testMode=True, plotType="D")
    assert observed == Counter({('1', 0): 0})
def test_save_sorted_sam(self):
    """Sorting reads by reference position and re-saving reproduces sorted.sam."""
    ctx = ADAMContext(self.ss)
    inputPath = self.resourceFile("sorted.sam")
    alignments = ctx.loadAlignments(inputPath)
    outputPath = self.tmpFile() + ".sam"
    bySortedPosition = alignments.sortReadsByReferencePosition()
    bySortedPosition.saveAsSam(outputPath, isSorted=True, asSingleFile=True)
    self.checkFiles(inputPath, outputPath)
def test_pipe_as_sam(self):
    """Piping reads through `tee /dev/null` as SAM preserves the record count."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("reads12.sam"))
    piped = alignments.pipe(
        ["tee", "/dev/null"],
        "org.bdgenomics.adam.rdd.read.SAMInFormatter",
        "org.bdgenomics.adam.rdd.read.AnySAMOutFormatter",
        "org.bdgenomics.adam.api.java.AlignmentRecordsToAlignmentRecordsConverter")
    self.assertEqual(alignments.toDF().count(), piped.toDF().count())
def test_realignIndels_known_indels(self):
    """Realigning against known indels preserves the 20 reads of small.1.sam."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.1.sam"))
    knownIndels = ctx.loadVariants(self.resourceFile("small.vcf"))
    realignedReads = alignments.realignIndelsFromKnownIndels(knownIndels)
    self.assertEqual(realignedReads.toDF().count(), 20)
def test_shuffle_right_outer_join_groupBy_left(self):
    """Right outer shuffle join grouped by left yields 21 rows."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.1.sam"))
    features = ctx.loadFeatures(self.resourceFile("small.1.bed"))
    joined = alignments.rightOuterShuffleRegionJoinAndGroupByLeft(features)
    self.assertEqual(joined.toDF().count(), 21)
def test_shuffle_inner_join(self):
    """Inner shuffle region join of reads against targets yields 5 rows."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.1.sam"))
    features = ctx.loadFeatures(self.resourceFile("small.1.bed"))
    joined = alignments.shuffleRegionJoin(features)
    self.assertEqual(joined.toDF().count(), 5)
def test_broadcast_right_outer_join(self):
    """Right outer broadcast region join of reads against targets yields 6 rows."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("small.1.sam"))
    features = ctx.loadFeatures(self.resourceFile("small.1.bed"))
    joined = alignments.rightOuterBroadcastRegionJoin(features)
    self.assertEqual(joined.toDF().count(), 6)
def test_fail_on_invalid_sample(self):
    """CoverageDistribution rejects out-of-range sampling fractions."""
    ac = ADAMContext(self.ss)
    testFile = self.resourceFile("small.sam")
    reads = ac.loadAlignments(testFile)
    coverage = reads.toCoverage()
    # FIX: both invalid values sat inside one assertRaises block, so the
    # second constructor call was unreachable once the first raised — it was
    # never actually tested. Each invalid value now gets its own assertion.
    with self.assertRaises(Exception):
        CoverageDistribution(self.ss, coverage, sample=1.2)
    with self.assertRaises(Exception):
        CoverageDistribution(self.ss, coverage, sample=0)
def test_caching(self):
    """cache() registers a persistent RDD and unpersist() removes it."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)
    cachedReads = reads.cache()
    cached = self.sc._jsc.getPersistentRDDs()
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(cached.isEmpty(), False)
    cachedReads.unpersist()
    cached = self.sc._jsc.getPersistentRDDs()
    self.assertEqual(cached.isEmpty(), True)
def test_to_coverage(self):
    """Collapsed coverage of unsorted.sam has 5 records; per-base has 46."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("unsorted.sam"))
    collapsed = alignments.toCoverage()
    self.assertEqual(collapsed.toDF().count(), 5)
    # 5 reads: contig 3 has 8 bp, chr2 (2 strands) has 20 bp, contig 4 has
    # 8 bp, contig 1 has 10 bp; 8 + 20 + 8 + 10 = 46 per-base records.
    perBase = alignments.toCoverage(collapse=False)
    self.assertEqual(perBase.toDF().count(), 46)
def test_transmute_to_coverage(self):
    """transmute can reshape alignment columns into a CoverageRDD."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(readsPath)
    readsAsCoverage = reads.transmute(
        lambda x: x.select(x.contigName,
                           x.start,
                           x.end,
                           x.mapq.cast(DoubleType()).alias("count")),
        CoverageRDD)
    assert isinstance(readsAsCoverage, CoverageRDD)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(readsAsCoverage.toDF().count(), 5)
def test_persisting(self):
    """persist(DISK_ONLY) registers a persistent RDD and unpersist() removes it."""
    ctx = ADAMContext(self.ss)
    alignments = ctx.loadAlignments(self.resourceFile("unsorted.sam"))
    persisted = alignments.persist(StorageLevel.DISK_ONLY)
    self.assertEqual(self.sc._jsc.getPersistentRDDs().isEmpty(), False)
    persisted.unpersist()
    self.assertEqual(self.sc._jsc.getPersistentRDDs().isEmpty(), True)
def test_transmute_to_coverage(self):
    """transmute can reshape alignment columns into a CoverageRDD."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.sc)
    reads = ac.loadAlignments(readsPath)
    readsAsCoverage = reads.transmute(
        lambda x: x.select(x.contigName,
                           x.start,
                           x.end,
                           x.mapq.cast(DoubleType()).alias("count")),
        CoverageRDD)
    assert isinstance(readsAsCoverage, CoverageRDD)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(readsAsCoverage.toDF().count(), 5)
def test_transmute_to_coverage(self):
    """transmute can reshape alignment columns into a CoverageDataset."""
    readsPath = self.resourceFile("unsorted.sam")
    ac = ADAMContext(self.ss)
    reads = ac.loadAlignments(readsPath)
    readsAsCoverage = reads.transmute(
        lambda x: x.select(x.referenceName,
                           x.start,
                           x.end,
                           x.mappingQuality.cast(DoubleType()).alias("count"),
                           x.readGroupSampleId.alias("optSampleId")),
        CoverageDataset)
    assert isinstance(readsAsCoverage, CoverageDataset)
    # FIX: assertEquals is a deprecated alias (removed in Python 3.12).
    self.assertEqual(readsAsCoverage.toDF().count(), 5)
# Licensed to Big Data Genomics (BDG) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The BDG licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Smoke test: load small.sam through the ADAM Python API and verify the
# expected record count; exits 0 on success, 1 on failure.

import sys

from bdgenomics.adam.adamContext import ADAMContext
from pyspark.context import SparkContext

sc = SparkContext('local')
ac = ADAMContext(sc)

reads = ac.loadAlignments("adam-core/src/test/resources/small.sam").toDF().count()

# FIX: exit() is the interactive-site helper (installed by the site module
# and not guaranteed to exist, e.g. under python -S); sys.exit is the
# supported API for setting the process exit status.
sys.exit(0 if reads == 20 else 1)