def lsh2(kmer):
    """Return a single min-hash style value for ``kmer``.

    Slides a window over ``kmer``, converts each window to a number with
    ``kmerutils.int_value``, scrambles every number with a multiplicative
    hash reduced modulo 2**32, and returns the smallest scrambled value.

    The time spent building the shingle list is added to the module-level
    ``global_timer`` accumulator.

    Args:
        kmer: Sliceable sequence supporting ``len()`` (presumably a
            nucleotide string -- confirm against callers).

    Returns:
        The minimum permuted shingle value (below 2**32 when
        ``kmerutils.int_value`` returns integers).

    Raises:
        ValueError: If ``kmer`` is shorter than ``shingle_len``, so no
            shingle can be formed.
    """
    global global_timer

    def permute(x):
        # Multiplicative hash: multiply by a large odd constant and keep
        # the low 32 bits.
        return (2654435761 * x) % (2**32)

    shingle_len = 16
    # NOTE(review): the number of windows is derived from shingle_len (16),
    # but each window is only 6 items wide. The width is kept as-is so the
    # produced hash values do not change; confirm which length was intended.
    shingle_width = 6
    num_shingles = len(kmer) - shingle_len + 1

    # time.clock() was removed in Python 3.8. Prefer perf_counter() and
    # only fall back to clock() on interpreters that lack it.
    clock = getattr(time, "perf_counter", None) or time.clock

    start_time = clock()
    shingles = [
        kmerutils.int_value(kmer[i:i + shingle_width])
        for i in range(num_shingles)
    ]
    global_timer += clock() - start_time

    if not shingles:
        # min() would raise the same exception type, with a vaguer message.
        raise ValueError(
            "kmer of length %d is too short for shingle_len=%d"
            % (len(kmer), shingle_len)
        )
    return min(permute(x) for x in shingles)
def lsh(kmer):
    """Return a hash of ``kmer`` built from one bit per entry in ``SEEDS``.

    Loosely follows the shingling / min-hash description at
    http://nlp.stanford.edu/IR-book/html/htmledition/near-duplicates-and-shingling-1.html

    Steps:
      1. Collect the distinct ``kmerutils.int_value`` results of every
         window of ``shingle_len`` items in ``kmer``.
      2. For each seed in the module-level ``SEEDS``, map every shingle
         through a seeded pseudo-random draw and keep the minimum.
      3. Pack the parity (lowest bit) of each minimum into the result,
         with the first seed ending up in the most significant position.

    Args:
        kmer: Sliceable sequence supporting ``len()`` (presumably a
            nucleotide string -- confirm against callers).

    Returns:
        An integer with one bit per seed; 0 when ``SEEDS`` is empty.

    Raises:
        ValueError: From ``min()`` when ``kmer`` is shorter than
            ``shingle_len`` (no shingles) and ``SEEDS`` is non-empty.
    """
    def permute(x, seed):
        # A private generator seeded with `seed` produces the same draw as
        # seeding the module-level generator would, but does not reset the
        # shared RNG state used by the rest of the program.
        return random.Random(seed).randint(0, x)

    # Parameters worth exploring:
    #   * shingle length
    #   * number of seeds to use
    #   * could we select the shingles in a strided pattern?
    shingle_len = 6  # original rationale: 2^6 == 64, 1 bit per shingle

    shingles = {
        kmerutils.int_value(kmer[i:i + shingle_len])
        for i in range(len(kmer) - shingle_len + 1)
    }

    value = 0
    for seed in SEEDS:
        min_permuted = min(permute(x, seed) for x in shingles)
        # Shift the accumulated bits left and append this seed's parity bit.
        value = value * 2 + min_permuted % 2
    return value