def test_prefix(self):
    """Check key listing, lookup, membership, prefix queries and length."""
    trieobj = trie.trie()
    trieobj["hello"] = 5
    trieobj["he"] = 7
    trieobj["hej"] = 9
    trieobj["foo"] = "bar"
    k = sorted(trieobj.keys())
    self.assertEqual(k, ["foo", "he", "hej", "hello"])
    self.assertEqual(trieobj["hello"], 5)
    self.assertEqual(trieobj.get("bye"), None)
    # Membership is tested with ``in`` rather than the Python 2-style
    # ``has_key`` method, and booleans with assertTrue/assertFalse.
    self.assertTrue("hello" in trieobj)
    self.assertTrue("he" in trieobj)
    self.assertFalse("bye" in trieobj)
    self.assertTrue(trieobj.has_prefix("h"))
    self.assertTrue(trieobj.has_prefix("hel"))
    self.assertFalse(trieobj.has_prefix("foa"))
    self.assertFalse(trieobj.has_prefix("hello world"))
    self.assertEqual(len(trieobj), 4)
    k = sorted(trieobj.with_prefix("he"))
    self.assertEqual(k, ["he", "hej", "hello"])
    k = trieobj.with_prefix("l")
    self.assertEqual(k, [])
    k = trieobj.with_prefix("hej")
    self.assertEqual(k, ["hej"])
    k = trieobj.with_prefix("hejk")
    self.assertEqual(k, [])
def test_save(self):
    """Exercise lookups and approximate matching, then a save/load round trip."""
    original = trie.trie()
    original["foo"] = 1
    self.assertEqual(list(original.keys()), ["foo"])
    self.assertEqual(list(original.values()), [1])
    self.assertEqual(original.get("bar", 99), 99)
    original["hello"] = '55a'

    # Each hit is a (key, value, number_of_mismatches) tuple.
    foo_exact = [("foo", 1, 0)]
    self.assertEqual(original.get_approximate("foo", 0), foo_exact)
    self.assertEqual(original.get_approximate("foo", 1), foo_exact)
    self.assertEqual(original.get_approximate("foa", 0), [])
    self.assertEqual(original.get_approximate("foa", 1), [("foo", 1, 1)])
    # With two edits allowed, "foa" reaches "foo" along three alignments:
    #   foo / foa    one mismatch (a -> o)
    #   foo- / f-oa  insertion after "f" plus deletion of an "o"
    #   foo- / fo-a  insertion after "o" plus deletion of an "o"
    self.assertEqual(
        sorted(original.get_approximate("foa", 2)),
        [("foo", 1, 1), ("foo", 1, 2), ("foo", 1, 2)],
    )

    # Count how many times each distinct hit is reported.
    tally = {}
    for hit in original.get_approximate("foo", 4):
        tally[hit] = tally.setdefault(hit, 0) + 1
    self.assertEqual(
        sorted(tally.items()),
        [(('foo', 1, 0), 1), (('hello', '55a', 4), 6)],
    )

    # Serialise into an in-memory binary buffer and read the trie back.
    buffer = BytesIO()
    trie.save(buffer, original)
    buffer.seek(0)
    reloaded = trie.load(buffer)
    reloaded_keys = list(reloaded.keys())
    self.assertTrue("foo" in reloaded_keys)
    self.assertTrue("hello" in reloaded_keys)
    self.assertEqual(repr(reloaded["foo"]), '1')
    self.assertEqual(repr(reloaded["hello"]), "'55a'")
def test_large_save_load(self):
    """Round-trip randomly generated key/value pairs through save and load.

    For each of three string lengths (100, 1000 and 10000), 1000 random
    lowercase key/value pairs of exactly that length are inserted into the
    trie and into a reference dict. The trie is written to a temp file and
    read back, and its entries are compared with the reference dict both
    before and after the round trip.
    """
    trieobj = trie.trie()
    self.assertEqual(trieobj.get("foobar"), None)
    for max_str_len in [100, 1000, 10000]:
        # The reference dict only tracks the current length category; the
        # trie itself keeps accumulating entries across categories.
        cmp_dict = {}
        for _ in range(1000):
            key = ''.join(
                random.choice(ascii_lowercase) for _ in range(max_str_len))
            val = ''.join(
                random.choice(ascii_lowercase) for _ in range(max_str_len))
            trieobj[key] = val
            cmp_dict[key] = val
        for key in cmp_dict:
            self.assertEqual(trieobj[key], cmp_dict[key])
        with tempfile.TemporaryFile(mode='w+b') as f:
            trie.save(f, trieobj)
            f.seek(0)
            trieobj = trie.load(f)
        for key in cmp_dict:
            self.assertEqual(trieobj[key], cmp_dict[key])
def test_save(self):
    """Exercise lookups and approximate matching, then a save/load round trip."""
    source = trie.trie()
    source["foo"] = 1
    self.assertEqual(source.keys(), ["foo"])
    self.assertEqual(source.values(), [1])
    self.assertEqual(source.get("bar", 99), 99)
    source["hello"] = "55a"

    # Each hit is a (key, value, number_of_mismatches) tuple.
    exact_foo = [("foo", 1, 0)]
    self.assertEqual(source.get_approximate("foo", 0), exact_foo)
    self.assertEqual(source.get_approximate("foo", 1), exact_foo)
    self.assertEqual(source.get_approximate("foa", 0), [])
    self.assertEqual(source.get_approximate("foa", 1), [("foo", 1, 1)])
    # Two edits let "foa" match "foo" in three ways:
    #   foo / foa    one mismatch (a -> o)
    #   foo- / f-oa  insertion after "f" plus deletion of an "o"
    #   foo- / fo-a  insertion after "o" plus deletion of an "o"
    two_edit_hits = sorted(source.get_approximate("foa", 2))
    self.assertEqual(two_edit_hits,
                     [("foo", 1, 1), ("foo", 1, 2), ("foo", 1, 2)])

    # Tally how often each distinct hit is reported.
    counts = {}
    for hit in source.get_approximate("foo", 4):
        if hit in counts:
            counts[hit] += 1
        else:
            counts[hit] = 1
    self.assertEqual(sorted(counts.items()),
                     [(("foo", 1, 0), 1), (("hello", "55a", 4), 6)])

    # Write to an in-memory handle, rewind, and load the trie again.
    handle = StringIO()
    trie.save(handle, source)
    handle.seek(0)
    restored = trie.load(handle)
    restored_keys = restored.keys()
    self.assertTrue("foo" in restored_keys)
    self.assertTrue("hello" in restored_keys)
    self.assertEqual(repr(restored["foo"]), "1")
    self.assertEqual(repr(restored["hello"]), "'55a'")
def get_list_of_candidates(target_string, PAM, gRNA_length, exclude_stop_codons, consider_negative, alt_pams):
    """Collect candidate sequences from a target into a trie keyed by sequence.

    Parameters:
        target_string: sequence searched on the forward strand.
        PAM: PAM pattern, expanded with ``NFiller(PAM).get_list()`` when
            ``alt_pams`` is None.
        gRNA_length: passed through to ``find_candidates``.
        exclude_stop_codons: when true, skip candidates whose sequence
            contains 'TAG', 'TAA' or 'TGA'.
        consider_negative: when true, also search the reverse complement of
            the target.
        alt_pams: optional explicit PAM list; every entry must have length 3
            and consist only of A, C, G and T.

    Returns:
        A ``trie.trie`` mapping each candidate sequence to '+' (found on the
        forward strand) or '-' (found on the reverse complement). The first
        strand that produces a key wins; forward candidates are inserted first.

    Raises:
        ValueError: if an entry of ``alt_pams`` has the wrong length or
            contains a character other than A, C, G, T.
    """
    stop_codons = ('TAG', 'TAA', 'TGA')
    valid_bases = ('A', 'C', 'G', 'T')

    target = target_string
    if consider_negative:
        target_rev = reverse_complement(target)

    if alt_pams is None:
        PAMs = NFiller(PAM).get_list()
    else:
        for pam in alt_pams:
            if len(pam) != 3:
                raise ValueError('Length of one or more PAMs not set to 3')
            for character in pam:
                if character not in valid_bases:
                    raise ValueError('Invalid PAM has been entered')
        PAMs = alt_pams

    candidates = []
    candidates_rev = []
    # A distinct loop name avoids shadowing the ``PAM`` parameter.
    for current_pam in PAMs:
        candidates.extend(find_candidates(target, current_pam, gRNA_length))
        if consider_negative:
            candidates_rev.extend(
                find_candidates(target_rev, current_pam, gRNA_length))

    trie_dic = trie.trie()
    for candidate in candidates:
        key = candidate[1]
        if exclude_stop_codons and any(codon in key for codon in stop_codons):
            continue
        # Stored values are never None, so a None lookup means "absent".
        # This avoids materialising the full key list for every candidate.
        if trie_dic.get(key) is None:
            trie_dic[key] = '+'

    for candidate in candidates_rev:
        # The stop-codon filter is applied to the sequence as found on the
        # reverse strand, before it is converted back to forward orientation.
        found = candidate[1]
        if exclude_stop_codons and any(codon in found for codon in stop_codons):
            continue
        key = reverse_complement(found)
        if trie_dic.get(key) is None:
            trie_dic[key] = '-'

    return trie_dic
def test_find(self):
    """Check triefind.match, match_all, find and find_words on a small trie."""
    from Bio import triefind

    text = "hello world!"
    trieobj = trie.trie()
    for word, value in (("hello", 5), ("he", 7), ("hej", 9),
                        ("foo", "bar"), ("wor", "ld")):
        trieobj[word] = value

    self.assertEqual(triefind.match(text, trieobj), "hello")
    self.assertEqual(sorted(triefind.match_all(text, trieobj)),
                     ["he", "hello"])
    # find() reports (key, start, end) triples.
    self.assertEqual(sorted(triefind.find(text, trieobj)),
                     [("he", 0, 2), ("hello", 0, 5), ("wor", 6, 9)])
    self.assertEqual(sorted(triefind.find_words(text, trieobj)),
                     [("hello", 0, 5)])

    # After adding "world", it is reported by both find() and find_words().
    trieobj["world"] = "full"
    self.assertEqual(sorted(triefind.find(text, trieobj)),
                     [("he", 0, 2), ("hello", 0, 5),
                      ("wor", 6, 9), ("world", 6, 11)])
    self.assertEqual(sorted(triefind.find_words(text, trieobj)),
                     [("hello", 0, 5), ("world", 6, 11)])
def parse_barcode_file(fp, primer=None, header=False):
    """
    Load label, barcode, primer records from a CSV file.

    Each non-blank row supplies a specimen label and a barcode in its first
    two columns. The primer comes from the ``primer`` argument when given,
    otherwise from the third column. Any additional columns are ignored.

    Returns a trie mapping every sequence produced by
    ``all_unambiguous(barcode + primer)`` to its specimen label.

    Raises ValueError if two rows produce the same sequence.
    """
    tr = trie.trie()
    reader = csv.reader(fp)

    if header:
        # Skip header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

    # Skip blank rows
    records = (record for record in reader if record)

    for record in records:
        specimen, barcode = record[:2]
        pr = primer if primer is not None else record[2]
        for sequence in all_unambiguous(barcode + pr):
            if tr.has_key(sequence):
                # Format the message; previously the template and its
                # arguments were passed to ValueError unformatted.
                raise ValueError(
                    "Duplicate sample: {0}, {1} both have {2}".format(
                        specimen, tr[sequence], sequence))
            logging.info('%s->%s', sequence, specimen)
            tr[sequence] = specimen

    return tr
def test_large_save_load(self):
    """Round-trip randomly generated key/value pairs through save and load.

    For each of three string lengths (100, 1000 and 10000), 1000 random
    lowercase key/value pairs of exactly that length are inserted into the
    trie and into a reference dict. The trie is written to a temp file and
    read back, and its entries are compared with the reference dict both
    before and after the round trip.
    """
    trieobj = trie.trie()
    self.assertEqual(trieobj.get("foobar"), None)
    for max_str_len in [100, 1000, 10000]:
        # The reference dict only tracks the current length category; the
        # trie itself keeps accumulating entries across categories.
        cmp_dict = {}
        for _ in range(1000):
            key = ''.join(
                random.choice(ascii_lowercase) for _ in range(max_str_len))
            val = ''.join(
                random.choice(ascii_lowercase) for _ in range(max_str_len))
            trieobj[key] = val
            cmp_dict[key] = val
        for key in cmp_dict:
            self.assertEqual(trieobj[key], cmp_dict[key])
        with tempfile.TemporaryFile(mode='w+b') as f:
            trie.save(f, trieobj)
            f.seek(0)
            trieobj = trie.load(f)
        for key in cmp_dict:
            self.assertEqual(trieobj[key], cmp_dict[key])
def test_prefix(self):
    """Check key listing, lookup, membership, prefix queries and length."""
    trieobj = trie.trie()
    for word, value in (("hello", 5), ("he", 7), ("hej", 9), ("foo", "bar")):
        trieobj[word] = value

    self.assertEqual(sorted(trieobj.keys()), ["foo", "he", "hej", "hello"])
    self.assertEqual(trieobj["hello"], 5)
    self.assertEqual(trieobj.get("bye"), None)

    # Membership of stored and missing keys.
    for stored in ("hello", "he"):
        self.assertTrue(stored in trieobj)
    self.assertFalse("bye" in trieobj)

    # Prefix existence, including prefixes that are not themselves keys.
    for prefix in ("h", "hel"):
        self.assertTrue(trieobj.has_prefix(prefix))
    for prefix in ("foa", "hello world"):
        self.assertFalse(trieobj.has_prefix(prefix))

    self.assertEqual(len(trieobj), 4)

    # Keys sharing a prefix.
    self.assertEqual(sorted(trieobj.with_prefix("he")),
                     ["he", "hej", "hello"])
    self.assertEqual(trieobj.with_prefix("l"), [])
    self.assertEqual(trieobj.with_prefix("hej"), ["hej"])
    self.assertEqual(trieobj.with_prefix("hejk"), [])
def parse_barcode_file(fp, primer=None, header=False):
    """
    Load label, barcode, primer records from a CSV file.

    Each non-blank row supplies a specimen label and a barcode in its first
    two columns. The primer comes from the ``primer`` argument when given,
    otherwise from the third column. Any additional columns are ignored.

    Returns a trie mapping every sequence produced by
    ``all_unambiguous(barcode + primer)`` to its specimen label.

    Raises ValueError if two rows produce the same sequence.
    """
    tr = trie.trie()
    reader = csv.reader(fp)

    if header:
        # Skip header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)

    # Skip blank rows
    records = (record for record in reader if record)

    for record in records:
        specimen, barcode = record[:2]
        pr = primer if primer is not None else record[2]
        for sequence in all_unambiguous(barcode + pr):
            if sequence in tr:
                # Format the message; previously the template and its
                # arguments were passed to ValueError unformatted.
                raise ValueError(
                    "Duplicate sample: {0}, {1} both have {2}".format(
                        specimen, tr[sequence], sequence))
            logging.info('%s->%s', sequence, specimen)
            tr[sequence] = specimen

    return tr
def test_get_approximate(self):
    """Regression test: insertions and deletions at the end of the key."""
    # A bug was found where trailing insertions/deletions were mishandled.
    trieobj = trie.trie()
    trieobj["hello"] = 1
    # (query, allowed mismatches, expected hits)
    cases = [
        ('he', 2, []),
        ('he', 3, [('hello', 1, 3)]),
        ('hello me!', 3, []),
        ('hello me!', 4, [('hello', 1, 4)]),
        ('hello me!', 5, [('hello', 1, 4)]),
    ]
    for query, mismatches, expected in cases:
        self.assertEqual(trieobj.get_approximate(query, mismatches), expected)
def test_get_approximate(self):
    """Regression test: insertions and deletions at the end of the key."""
    # A bug was found where trailing insertions/deletions were mishandled.
    trieobj = trie.trie()
    trieobj["hello"] = 1
    hello_at_three = [("hello", 1, 3)]
    hello_at_four = [("hello", 1, 4)]

    # Query shorter than the stored key.
    short_query = "he"
    self.assertEqual(trieobj.get_approximate(short_query, 2), [])
    self.assertEqual(trieobj.get_approximate(short_query, 3), hello_at_three)

    # Query longer than the stored key.
    long_query = "hello me!"
    self.assertEqual(trieobj.get_approximate(long_query, 3), [])
    self.assertEqual(trieobj.get_approximate(long_query, 4), hello_at_four)
    self.assertEqual(trieobj.get_approximate(long_query, 5), hello_at_four)
def __init__(self):
    """Build a trie index from InChI layers to the molecule ids containing them."""
    # Remove the InChI special characters from triefind's module-level
    # boundary characters. NOTE: this rebinds a global on every instantiation.
    boundary_chars = triefind.DEFAULT_BOUNDARY_CHARS
    triefind.DEFAULT_BOUNDARY_CHARS = boundary_chars.translate(None, utils.INCHI_SPECIAL_CHARS)

    self.index = trie.trie()
    structures = CompoundStructures.objects.all().values_list('standard_inchi', 'molecule_id')
    for standard_inchi, molecule_id in structures:
        # Skip the part before the first '/' and index the remaining
        # '/'-separated chunks.
        for chunk in standard_inchi.split('/')[1:]:
            if len(chunk) <= 2:
                continue
            key = str(chunk)
            if self.index.get(key):
                self.index[key].append(molecule_id)
            else:
                self.index[key] = [molecule_id]
def test_with_prefix(self):
    """Store every suffix of "BANANA" and query the trie by prefix."""
    text = "BANANA"
    trieobj = trie.trie()
    # Each suffix is stored under its own text, valued by its start offset.
    for start in range(len(text)):
        suffix = text[start:]
        trieobj[suffix] = start
        self.assertEqual(trieobj[suffix], start)

    self.assertEqual(set(trieobj.values()), set(range(6)))
    self.assertEqual({"A", "ANA", "ANANA", "BANANA", "NA", "NANA"},
                     set(trieobj.keys()))
    self.assertEqual({"NA", "NANA"}, set(trieobj.with_prefix("N")))
    self.assertEqual({"NA", "NANA"}, set(trieobj.with_prefix("NA")))
    self.assertEqual({"A", "ANA", "ANANA"}, set(trieobj.with_prefix("A")))
    self.assertEqual({"ANA", "ANANA"}, set(trieobj.with_prefix("AN")))
def test_with_prefix(self):
    """Store every suffix of "BANANA" and query the trie by prefix."""
    word = "BANANA"
    trieobj = trie.trie()
    # Each suffix is stored under its own text, valued by its start offset.
    for offset in range(len(word)):
        tail = word[offset:]
        trieobj[tail] = offset
        self.assertEqual(trieobj[tail], offset)

    self.assertEqual(set(trieobj.values()), set(range(6)))
    self.assertEqual({'A', 'ANA', 'ANANA', 'BANANA', 'NA', 'NANA'},
                     set(trieobj.keys()))

    # (prefix, keys expected to start with it)
    expectations = [
        ("N", {'NA', 'NANA'}),
        ("NA", {'NA', 'NANA'}),
        ("A", {'A', 'ANA', 'ANANA'}),
        ("AN", {'ANA', 'ANANA'}),
    ]
    for prefix, expected in expectations:
        self.assertEqual(expected, set(trieobj.with_prefix(prefix)))
def test_get_set(self):
    """Check item assignment, lookup, overwrite, missing keys and length."""
    initial = [
        ("hello world", "s1"),
        ("bye", "s2"),
        ("hell sucks", "s3"),
        ("hebee", "s4"),
    ]
    trieobj = trie.trie()
    for key, value in initial:
        trieobj[key] = value
    for key, value in initial:
        self.assertEqual(trieobj[key], value)

    trieobj["blah"] = "s5"
    self.assertEqual(trieobj["blah"], "s5")
    self.assertEqual(trieobj.get("foobar"), None)
    self.assertEqual(len(trieobj), 5)

    # Assigning to an existing key replaces its value.
    trieobj["blah"] = "snew"
    self.assertEqual(trieobj["blah"], "snew")
def construct_complex_trie(counter, lengths=None):
    """Build a trie of sequences and their chunks from a sequence counter.

    Sequences are processed longest first. For every requested chunk length
    not exceeding the sequence length, each chunk from ``chunker(seq, length)``
    that is not yet in the trie is stored: a chunk equal to the whole sequence
    maps to ``counter[seq]``, any other chunk maps to the sequence it came from.

    ``lengths`` defaults to the sorted distinct sequence lengths.
    NOTE(review): ``chunker`` is defined elsewhere; it presumably yields the
    subsequences of the given length - confirm.
    """
    result = trie.trie()
    ordered_seqs = sorted(counter, key=len, reverse=True)
    if lengths is None:
        lengths = sorted({len(seq) for seq in ordered_seqs})

    for seq in ordered_seqs:
        seq_len = len(seq)
        for chunk_len in lengths:
            if chunk_len <= seq_len:
                for subseq in chunker(seq, chunk_len):
                    if not result.has_key(subseq):
                        if subseq == seq:
                            result[seq] = counter[seq]
                        else:
                            result[subseq] = seq
    return result
def gen_seq_tries(seqs, tol):
    """Group sequences by length into tries, longest length first.

    Returns ``(trie_list, lengths)``: ``lengths`` holds the distinct sequence
    lengths in descending order and ``trie_list[i]`` contains, with value 0,
    every kept sequence of length ``lengths[i]``.
    """
    # Sequences no longer than the tolerance are dropped; this prevents weird
    # behaviour and should be reasonable for pretty much all use cases. The
    # floor of 2 exists because the tolerance may be given as a fraction, so
    # a length-0 sequence could otherwise slip through.
    # NOTE(review): the filter keeps len(seq) > max(tol, 2), i.e. length 3 or
    # more when tol <= 2, whereas the earlier comment spoke of "at least
    # length 2" - confirm which is intended.
    min_exclusive = max(tol, 2)
    kept = [seq for seq in seqs if len(seq) > min_exclusive]

    lengths = sorted({len(seq) for seq in kept}, reverse=True)
    trie_list = [trie() for _ in lengths]
    position_of = {length: idx for idx, length in enumerate(lengths)}
    for seq in kept:
        bucket = trie_list[position_of[len(seq)]]
        bucket[seq] = 0

    # FUTURE PLAN: allow the observed sequence to match a reference sequence
    # of a different length, so that insertions only cost 1 instead of 2.
    return trie_list, lengths
def extract_peptides(sequences, min_length=7, max_length=20):
    """
    Extract subsequences from full protein sequences, and return
    a mapping from each kmer to its source sequences.

    Parameters
    ----------
    sequences : list of Sequence
        All generated protein sequences; each must expose ``amino_acids``

    min_length : int
        Smallest peptide length to include

    max_length : int
        Largest peptide length to include

    Returns
    -------
    A Bio.trie trie from str to the list of Sequence objects which
    contained that peptide. Each sequence is listed at most once per peptide.
    """
    from Bio.trie import trie
    peptide_dict = trie()
    for sequence_obj in progressbar(sequences):
        amino_acids = sequence_obj.amino_acids
        n_aa = len(amino_acids)
        already_seen_for_protein = set()
        for i in range(n_aa - min_length + 1):
            longest_peptide = amino_acids[i:i + max_length]
            longest_peptide_length = len(longest_peptide)
            # The upper bound is inclusive so that peptides of max_length, and
            # the final min_length window at the end of the protein, are
            # extracted too (previously they were silently skipped).
            for k in range(min_length, longest_peptide_length + 1):
                kmer = longest_peptide[:k]
                if kmer in already_seen_for_protein:
                    continue
                already_seen_for_protein.add(kmer)
                if kmer in peptide_dict:
                    peptide_dict[kmer].append(sequence_obj)
                else:
                    peptide_dict[kmer] = [sequence_obj]
    return peptide_dict
def resetSignatureDataStructure(self):
    """Replace the signature trie with a new empty trie and mark it as empty."""
    self.signatureTrie = trie.trie()
    self.isEmpty = True
def construct_simple_trie(counter):
    """Return a trie mapping every sequence in ``counter`` to its count.

    ``counter`` is any mapping from sequence to count.
    """
    t = trie.trie()
    # items() works on both Python 2 and Python 3; iteritems() is Python 2 only.
    for seq, count in counter.items():
        t[seq] = count
    return t
#!/usr/bin/env python import StringIO from operator import truth from Bio import trie from Bio import triefind trieobj = trie.trie() trieobj["hello"] = 5 trieobj["he"] = 7 trieobj["hej"] = 9 trieobj["foo"] = "bar" trieobj["wor"] = "ld" print triefind.match("hello world!", trieobj) # "hello" k = triefind.match_all("hello world!", trieobj) k.sort() print k # ["he", "hello"] k = triefind.find("hello world!", trieobj) k.sort() print k # [("he", 0, 2), ("hello", 0, 5), ("wor", 6, 9)] k = triefind.find_words("hello world!", trieobj) k.sort() print k # [("hello", 0, 5)] trieobj["world"] = "full" k = triefind.find("hello world!", trieobj)
def setUp(self):
    """Create a trie holding three 4-character binary-string keys, all valued 12."""
    self.tr = trie.trie()
    self.tr['1000'] = 12
    self.tr['1011'] = 12
    self.tr['1010'] = 12
def setUp(self):
    """Create a trie holding three 4-character binary-string keys, all valued 12."""
    self.tr = trie.trie()
    for key in ('1000', '1011', '1010'):
        self.tr[key] = 12
# How to perform efficient membership search on very large datasets in Python from Bio import trie tr = trie.trie()
def generate_adjacent_mers(sequence, max_hamming_distance):
    """Return a trie populated by ``hamming_ball`` for ``sequence``.

    The trie is passed to ``hamming_ball`` together with the sequence, the
    maximum Hamming distance and the alphabet 'AGCT'.
    NOTE(review): ``hamming_ball`` is defined elsewhere; presumably it inserts
    every sequence within the given distance - confirm.
    """
    neighbours = trie.trie()
    hamming_ball(sequence, max_hamming_distance, 'AGCT', neighbours)
    return neighbours
from Bio import trie #trie from the Biopython computational molecular biology library run_time = time.time() def t9_prediction(string, t9): """Return all the keys in the trie that match anywhere in the string.""" for word in t.with_prefix(string): if len(word) == len(string): t9.append(word) return t9 #words = [line.strip() for line in open("/home/n/work/py/VR/t9/english.txt", "rb")] words = [line.strip() for line in open(sys.argv[1], "rb")] t = trie.trie() for word in words: t[word] = 1 mapping = {1:["'"], 2:["a", "A", "b", "B", "c", "C"], \ 3:["d", "D", "e", "E", "f", "F"], \ 4:["g", "G", "h", "H", "i", "I"], \ 5:["j", "J", "k", "K", "l", "L"], \ 6:["m", "M", "n", "N", "o", "O"], \ 7:["p", "P", "q", "Q", "r", "R", "s","S"], \ 8:["t", "T", "u", "U", "v", "V"], \ 9:["w", "W", "x", "X", "y", "Y", "z","Z"]} \ while True: try: my_inputs = [int(i) for i in raw_input("Enter space separated inputs: ").split()]
# Demonstration of basic Bio.trie operations (Python 2 print statements).
# The trailing comment on each print line gives the expected output.
import StringIO
from operator import truth

# Bio.trie needs compiled C code; turn a failed import into a
# MissingExternalDependencyError with an explanatory message.
try :
    from Bio import trie
except ImportError :
    import os
    from Bio import MissingExternalDependencyError
    if os.name=="java" :
        message = "Not available on Jython, Bio.trie requires compiled C code."
    else :
        message = "Could not import Bio.trie, check C code was compiled."
    raise MissingExternalDependencyError(message)

# Populate the trie; values can be of mixed types.
trieobj = trie.trie()
trieobj["hello"] = 5
trieobj["he"] = 7
trieobj["hej"] = 9
trieobj["foo"] = "bar"

# Sort the keys so the printed order is stable.
k = trieobj.keys()
k.sort()
print k  # ["foo", "he", "hej", "hello"]
print trieobj["hello"]  # 5
print trieobj.get("bye")  # None

print trieobj.has_key("hello")  # 1
print trieobj.has_key("he")  # 1
print trieobj.has_key("bye")  # 0
from itertools import product from Bio import trie t = trie.trie() dictionary = [word.strip() for word in open("dictionary.txt", "rb")] for word in dictionary: t[word] = len(word) t9Map = { 1: [""], 2: ["a", "b", "c"], 3: ["d", "e", "f"], 4: ["g", "h", "i"], 5: ["j", "k", "l"], 6: ["m", "n", "o"], 7: ["p", "q", "r", "s"], 8: ["t", "u", "v"], 9: ["w", "x", "y", "z"] } print "Enter a number:" input = input() strings = [] for char in str(input): strings.append(t9Map[int(char)]) result = [] for all in product( *(x for x in strings
def __init__(self):
    """Start with an empty trie index and the ``modified`` flag set."""
    self.index = trie.trie()
    self.modified = True
def resetSignatureDataStructure(self):
    """Replace the signature trie with a new empty trie and mark it as empty."""
    self.signatureTrie, self.isEmpty = trie.trie(), True


class SignaturePermutationWithSortedList(Permutation):
# Demonstration of the Bio.triefind matching helpers (Python 2 print
# statements). The trailing comment on each print line gives the expected output.

# Bio.trie needs compiled C code; turn a failed import into a
# MissingExternalDependencyError with an explanatory message.
try :
    from Bio import trie
except ImportError :
    import os
    from Bio import MissingExternalDependencyError
    if os.name=="java" :
        message = "Not available on Jython, Bio.trie requires compiled C code."
    else :
        message = "Could not import Bio.trie, check C code was compiled."
    raise MissingExternalDependencyError(message)

from Bio import triefind

# Populate the trie; values can be of mixed types.
trieobj = trie.trie()
trieobj["hello"] = 5
trieobj["he"] = 7
trieobj["hej"] = 9
trieobj["foo"] = "bar"
trieobj["wor"] = "ld"

print triefind.match("hello world!", trieobj)  # "hello"
# Results are sorted before printing so the output order is stable.
k = triefind.match_all("hello world!", trieobj)
k.sort()
print k  # ["he", "hello"]
k = triefind.find("hello world!", trieobj)
k.sort()
print k  # [("he", 0, 2), ("hello", 0, 5), ("wor", 6, 9)]
#!/usr/bin/env python import StringIO from operator import truth from Bio import trie trieobj = trie.trie() trieobj["hello"] = 5 trieobj["he"] = 7 trieobj["hej"] = 9 trieobj["foo"] = "bar" k = trieobj.keys() k.sort() print k # ["foo", "he", "hej", "hello"] print trieobj["hello"] # 5 print trieobj.get("bye") # None print trieobj.has_key("hello") # 1 print trieobj.has_key("he") # 1 print trieobj.has_key("bye") # 0 print trieobj.has_prefix("h") # 1 print trieobj.has_prefix("hel") # 1 print trieobj.has_prefix("foa") # 0 print trieobj.has_prefix("hello world") # 0 print len(trieobj) # 4
import argparse import os import sys import platform from Bio import trie from Bio import triefind # Quick check for either linux or OSX if platform.system() == "Linux": prefix = "/home/tabboud" else: prefix = "/Users/tabboud" DROPBOX_DIR = prefix + '/Dropbox' IGNORE_FILE = '.dbignore' cache = trie.trie() def cache_patterns(ignore_file): """ Open and read the patterns in an ignore file Args: ignore_file: IgnoreFile object NOTES: see dir.c:add_excludes for reading the file and storing the patterns in el """ ignore_path = ignore_file.path if not os.path.isfile(ignore_path): print "ERROR: %s is not a file" % ignore_path return with open(ignore_path, 'r') as fp: for pattern in fp.readlines():