class TestContains(unittest.TestCase):
    """Check that every key added to an ``Elf`` is reported as a member."""

    def setUp(self):
        """Store a new ``Elf(100)`` on ``self.e`` and add "0" through "99"."""
        self.e = Elf(100)
        for i in range(100):
            self.e.add(str(i))

    def testContains(self):
        """Each key added in ``setUp`` must pass the ``in`` test."""
        for i in range(100):
            # assertTrue replaces the deprecated alias assert_, which was
            # removed from unittest in Python 3.12.
            self.assertTrue(str(i) in self.e)
class TestAddMany(unittest.TestCase):
    """Check the bulk-insert entry points ``addmany`` and ``update``."""

    def setUp(self):
        """Store a new, empty ``Elf(100)`` on ``self.e``."""
        self.e = Elf(100)

    def testAdd(self):
        """Keys passed to ``addmany`` as a generator must all be members."""
        self.e.addmany(str(i) for i in range(100))
        for i in range(100):
            # assertTrue replaces the deprecated alias assert_, which was
            # removed from unittest in Python 3.12.
            self.assertTrue(str(i) in self.e)

    def testUpdate(self):
        """Keys passed to ``update`` as a generator must all be members."""
        self.e.update(str(i) for i in range(100))
        for i in range(100):
            self.assertTrue(str(i) in self.e)
class TestSerialize(TestContains):
    """Check saving an ``Elf`` to disk and loading it back.

    Inherits ``testContains`` from ``TestContains``, so the membership check
    also runs against the filter built here.
    """

    def setUp(self):
        """Build the filter with keys "0".."99" and save it to "_t.elf"."""
        self.e = Elf(100)
        for i in range(100):
            self.e.add(str(i))
        self.e.save("_t.elf")

    def testSave(self):
        """``save`` must have created the file on disk."""
        # assertTrue replaces the deprecated alias assert_, which was
        # removed from unittest in Python 3.12.
        self.assertTrue(os.path.exists("_t.elf"))

    def testLoad(self):
        """A filter loaded from "_t.elf" must contain "1" and "10"."""
        e = Elf.load("_t.elf")
        self.assertTrue("1" in e)
        self.assertTrue("10" in e)

    def tearDown(self):
        """Remove the file written by ``setUp``."""
        os.unlink("_t.elf")
# Substitute the script name into the module docstring, which is printed
# below when the command line is wrong.
__doc__ %= sys.argv[0]

# Exactly one argument (the input file) is required.  The original test,
# ``len(sys.argv) > 2``, let a call with no argument fall through and fail
# with an IndexError on ``sys.argv[1]`` instead of showing the docstring.
if len(sys.argv) != 2:
    # write() instead of the print statement so this section parses on both
    # Python 2 and Python 3; the emitted text is unchanged.
    sys.stdout.write("%s\n" % (sys.argv,))
    sys.stdout.write("%s\n" % __doc__)
    sys.exit()

sys.stderr.write("%s %s\n" % ("Command: ", " ".join(sys.argv)))

infile = sys.argv[1]
fp = FastQParser(infile)

# First pass: run the parser over the whole file before asking it for the
# record count.
for _ in fp:
    pass
# NOTE(review): rread() is defined outside this view; it is used here as the
# number of records in the file -- confirm against FastQParser.
records = fp.rread()
sys.stderr.write("%s %s %s\n" % (records, "records in file ", infile))

# say 1 out of 1000 is false positive.
bloom = Elf(records, error_rate=1e-3)

# Second pass: collect every sequence the filter reports as already seen,
# then add the sequence to the filter.
fp.seek(0)
checks = []
for _, seq, _, _ in fp:
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)

# now checks contains anything that could be a duplicate according to
# the bloomfilter. for some, they were false positives.
# for actual duplicated, just choose the first, but can also sort by quality.
fp.seek(0)
checks = frozenset(checks)
sys.stderr.write("checking %s potential duplicates in a python set"
                 % len(checks) + "\n")

# NOTE(review): split(".")[0] keeps only the text before the FIRST dot of the
# whole path, so "./reads.fastq" yields "-unique.fastq.gz" -- confirm this
# naming is intended before changing it.
outfile = "%s-unique.fastq.gz" % infile.split(".")[0]
def setUp(self):
    """Store a new ``Elf(100)`` on ``self.e``."""
    fresh = Elf(100)
    self.e = fresh
def testLoad(self):
    """A filter loaded from "_t.elf" must contain "1" and "10"."""
    e = Elf.load("_t.elf")
    # assertTrue replaces the deprecated alias assert_, which was removed
    # from unittest in Python 3.12.
    self.assertTrue("1" in e)
    self.assertTrue("10" in e)
def setUp(self):
    """Store a new ``Elf(100)`` on ``self.e``, add "0".."99", save to "_t.elf"."""
    # Bind the attribute first so it is set even if a later call raises,
    # exactly as in the original statement order.
    self.e = bloom = Elf(100)
    for key in map(str, range(100)):
        bloom.add(key)
    bloom.save("_t.elf")
def setUp(self):
    """Store a new ``Elf(100)`` on ``self.e`` and add "0" through "99"."""
    # Bind the attribute first so it is set even if an add() call raises,
    # exactly as in the original statement order.
    self.e = bloom = Elf(100)
    for key in map(str, range(100)):
        bloom.add(key)
%s < in.fastq > out.unique.fastq """
from bloomfaster import Elf
import collections
import sys

# Substitute the script name into the module docstring, which is printed
# below when the command line is wrong.
__doc__ %= sys.argv[0]
# Input is read from stdin, so any command-line argument is treated as a
# usage error: echo the arguments, print the docstring and exit.
if len(sys.argv) > 1:
    print sys.argv
    print __doc__
    sys.exit()

# First pass over stdin: count the lines.  The loop below consumes two lines
# (header, sequence) per iteration, hence the division by 2.
# NOTE(review): FASTQ records are normally four lines; with such input the
# "+" and quality lines would be read as header/sequence pairs -- confirm the
# expected input layout.
records = sum(1 for _ in sys.stdin) / 2
print >>sys.stderr, records, "records in file"

# say 1 out of 1000 is false positive.
bloom = Elf(records, error_rate=1e-3)

# Rewind stdin for the second pass.
# NOTE(review): this needs a seekable stdin (a file redirect as in the usage
# line); presumably it fails when stdin is a pipe -- confirm.
sys.stdin.seek(0)
readline = sys.stdin.readline

# Second pass: collect every sequence the filter reports as already seen,
# then add the sequence to the filter.
checks = []
header = readline().rstrip()
# The loop stops at the first header line that is empty after rstrip()
# (end of input or a blank line).
while header:
    seq = readline().rstrip()
    if seq in bloom:
        checks.append(seq)
    bloom.add(seq)
    header = readline().rstrip()

# now checks contains anything that could be a duplicate according to
# the bloomfilter. for some, they were false positives.