def create_minhashes_from_sets(sets, num_perms, hashfunc, pad_for_asym=False):
    """Build one MinHash per input set for every requested permutation count.

    Args:
        sets: Re-iterable collection of sized iterables. Each element is
            converted with ``str()`` before being fed to ``MinHash.update``.
        num_perms: Re-iterable collection of permutation counts; one list of
            MinHash objects is produced per entry.
        hashfunc: Forwarded unchanged to ``MinHash`` as ``hashfunc``.
        pad_for_asym: When true, every set's MinHash is merged with a padding
            MinHash built from ``max_size - len(s)`` synthetic tokens, where
            ``max_size`` is the size of the largest input set.

    Returns:
        A dict mapping each ``num_perm`` to a list of MinHash objects, in the
        same order as ``sets``. For empty ``sets`` every list is empty.
    """
    # Sizes are needed for both the padding table and the per-set lookup.
    set_sizes = [len(s) for s in sets]
    # default=0 keeps an empty input from raising ValueError in max().
    max_size = max(set_sizes, default=0)

    # Generate paddings for asym.
    paddings = {}
    if pad_for_asym:
        padding_sizes = sorted({max_size - size for size in set_sizes})
        for num_perm in num_perms:
            pads_by_size = {}
            prev_size, prev_pad = 0, None
            for padding_size in padding_sizes:
                # Each padding extends the next smaller one, so only the
                # tokens in [prev_size, padding_size) have to be hashed.
                if prev_pad is None:
                    pad = MinHash(num_perm, hashfunc=hashfunc)
                else:
                    pad = prev_pad.copy()
                for w in range(prev_size, padding_size):
                    # NOTE(review): the suffix presumably keeps padding
                    # tokens from colliding with real elements - confirm.
                    pad.update(str(w) + "_tmZZRe8DE23s")
                pads_by_size[padding_size] = pad
                prev_size, prev_pad = padding_size, pad
            paddings[num_perm] = pads_by_size

    # Generate minhash
    minhashes = {}
    for num_perm in num_perms:
        print("Using num_perm = {}".format(num_perm))
        ms = []
        for s, size in zip(sets, set_sizes):
            m = MinHash(num_perm, hashfunc=hashfunc)
            for word in s:
                m.update(str(word))
            if pad_for_asym:
                # Add padding to the minhash
                m.merge(paddings[num_perm][max_size - size])
            ms.append(m)
            sys.stdout.write("\rMinhashed {} sets".format(len(ms)))
        sys.stdout.write("\n")
        minhashes[num_perm] = ms
    return minhashes
def bootstrap_sets(sets_file, sample_ratio, num_perms, skip=1, pad_for_asym=False):
    """Sample sets from a gzipped file and build MinHashes for them.

    Each data line is split on tabs; the second field is parsed as a
    comma-separated list of integers and stored as a NumPy array.

    Args:
        sets_file: Path to a gzip-compressed text file.
        sample_ratio: A line is kept when ``random.random()`` does not exceed
            this value.
        num_perms: Re-iterable collection of permutation counts; one list of
            MinHash objects is produced per entry.
        skip: Number of leading lines to ignore.
        pad_for_asym: When true, every set's MinHash is merged with a padding
            MinHash built from ``max_size - len(s)`` synthetic tokens, where
            ``max_size`` is the size of the largest sampled set.

    Returns:
        A tuple ``(minhashes, sets, keys)``: ``minhashes`` maps each
        ``num_perm`` to a list of MinHash objects aligned with ``sets``,
        ``sets`` is the list of sampled arrays, and ``keys`` is
        ``[0, 1, ..., len(sets) - 1]``. If no line is sampled, the lists are
        empty.
    """
    print("Creating sets...")
    sets = []
    # Fixed seed: the same lines are sampled on every run. Note that this
    # reseeds the module-level random generator.
    random.seed(41)
    with gzip.open(sets_file, "rt") as f:
        for i, line in enumerate(f):
            if i < skip:
                # Skip lines
                continue
            if random.random() > sample_ratio:
                continue
            fields = line.strip().split("\t")[1].split(",")
            sets.append(np.array([int(d) for d in fields]))
            sys.stdout.write("\rRead {} sets".format(len(sets)))
    sys.stdout.write("\n")
    keys = list(range(len(sets)))

    set_sizes = [len(s) for s in sets]
    # default=0 keeps an empty sample from raising ValueError in max().
    max_size = max(set_sizes, default=0)

    # Generate paddings for asym.
    paddings = {}
    if pad_for_asym:
        padding_sizes = sorted({max_size - size for size in set_sizes})
        for num_perm in num_perms:
            pads_by_size = {}
            prev_size, prev_pad = 0, None
            for padding_size in padding_sizes:
                # Each padding extends the next smaller one, so only the
                # tokens in [prev_size, padding_size) have to be hashed.
                if prev_pad is None:
                    pad = MinHash(num_perm, hashfunc=_hash_32)
                else:
                    pad = prev_pad.copy()
                for w in range(prev_size, padding_size):
                    pad.update(str(w) + "_tmZZRe8DE23s")
                pads_by_size[padding_size] = pad
                prev_size, prev_pad = padding_size, pad
            paddings[num_perm] = pads_by_size

    # Generate minhash
    print("Creating MinHash...")
    minhashes = {}
    for num_perm in num_perms:
        print("Using num_perm = {}".format(num_perm))
        ms = []
        for s, size in zip(sets, set_sizes):
            m = MinHash(num_perm, hashfunc=_hash_32)
            for word in s:
                m.update(str(word))
            if pad_for_asym:
                # Add padding to the minhash
                m.merge(paddings[num_perm][max_size - size])
            ms.append(m)
            sys.stdout.write("\rMinhashed {} sets".format(len(ms)))
        sys.stdout.write("\n")
        minhashes[num_perm] = ms

    return (minhashes, sets, keys)
def _construct_contigs_minhash(self):
    """Merge the MinHash of every non-header line of the contigs file.

    Reads the file at ``self.__contigs_path`` line by line, skips lines that
    start with ``'>'`` (presumably FASTA record headers - confirm), and merges
    ``self._construct_minhash(line.strip())`` for each remaining line into a
    single 2048-permutation MinHash.

    Returns:
        The merged MinHash.
    """
    print('begin_construct_contigs_minhash')
    contigs_minhash = MinHash(num_perm=2048)
    # The context manager closes the file even if hashing a line raises, and
    # iterating the handle avoids loading the whole file into memory.
    with open(self.__contigs_path, 'r') as contigs_file:
        for contig in contigs_file:
            if contig.startswith('>'):
                continue
            contigs_minhash.merge(self._construct_minhash(contig.strip()))
    print('end_construct_contigs_minhash')
    return contigs_minhash