Exemplo n.º 1
0
def create_minhashes_from_sets(sets, num_perms, hashfunc, pad_for_asym=False):
    """Build one MinHash per input set for every requested permutation count.

    Args:
        sets: Sized, re-iterable collection of sets. It is traversed several
            times, and every member must support ``len()`` and iteration.
            Each element is hashed through its ``str()`` form.
        num_perms: Re-iterable collection of permutation counts; one list of
            MinHashes is produced per entry.
        hashfunc: Passed unchanged to ``MinHash`` as ``hashfunc``.
        pad_for_asym: When true, each set's MinHash is merged with a MinHash
            of ``max_size - len(s)`` synthetic tokens, so every set is
            effectively padded up to the size of the largest set.
            (Presumably for asymmetric minwise hashing -- confirm with the
            caller.)

    Returns:
        A dict mapping each ``num_perm`` to a list of MinHash objects, in the
        same order as ``sets``. An empty ``sets`` yields an empty list for
        every ``num_perm``.
    """
    # ``default=0`` keeps an empty ``sets`` from raising ValueError.
    max_size = max((len(s) for s in sets), default=0)

    # Generate paddings for asym: one padding MinHash per distinct pad size.
    paddings = {}
    if pad_for_asym:
        padding_sizes = sorted({max_size - len(s) for s in sets})
        for num_perm in num_perms:
            pads = {}
            pad = None
            prev_size = 0
            for padding_size in padding_sizes:
                # Sizes are ascending, so each padding extends a copy of the
                # previous (smaller) one instead of re-hashing from zero.
                if pad is None:
                    pad = MinHash(num_perm, hashfunc=hashfunc)
                else:
                    pad = pad.copy()
                for w in range(prev_size, padding_size):
                    pad.update(str(w) + "_tmZZRe8DE23s")
                pads[padding_size] = pad
                prev_size = padding_size
            paddings[num_perm] = pads

    # Generate minhash
    minhashes = {}
    for num_perm in num_perms:
        print("Using num_perm = {}".format(num_perm))
        ms = []
        for s in sets:
            m = MinHash(num_perm, hashfunc=hashfunc)
            for word in s:
                m.update(str(word))
            if pad_for_asym:
                # Add padding to the minhash
                m.merge(paddings[num_perm][max_size - len(s)])
            ms.append(m)
            sys.stdout.write("\rMinhashed {} sets".format(len(ms)))
        sys.stdout.write("\n")
        minhashes[num_perm] = ms
    return minhashes
Exemplo n.º 2
0
def bootstrap_sets(sets_file,
                   sample_ratio,
                   num_perms,
                   skip=1,
                   pad_for_asym=False):
    """Sample sets from a gzipped file and build MinHashes for them.

    Each line of the file is tab-separated; the second field is a
    comma-separated list of integers that becomes one set (stored as a NumPy
    array). The first ``skip`` lines are ignored. Every remaining line is
    kept when ``random.random() <= sample_ratio``.

    Note: this reseeds the module-level ``random`` generator with 41 so the
    sample is reproducible; that affects later ``random`` calls elsewhere.

    Args:
        sets_file: Path to the gzip-compressed text file.
        sample_ratio: Keep threshold compared against ``random.random()``.
        num_perms: Re-iterable collection of permutation counts; one list of
            MinHashes is produced per entry.
        skip: Number of leading lines to ignore.
        pad_for_asym: When true, each set's MinHash is merged with a MinHash
            of ``max_size - len(s)`` synthetic tokens, so every set is
            effectively padded up to the size of the largest sampled set.

    Returns:
        A tuple ``(minhashes, sets, keys)``: ``minhashes`` maps each
        ``num_perm`` to a list of MinHash objects ordered like ``sets``;
        ``sets`` is the list of sampled arrays; ``keys`` is
        ``[0, 1, ..., len(sets) - 1]``. If no line is sampled, the lists are
        empty.
    """
    print("Creating sets...")
    sets = []
    random.seed(41)
    with gzip.open(sets_file, "rt") as f:
        for i, line in enumerate(f):
            if i < skip:
                # Skip lines
                continue
            if random.random() > sample_ratio:
                continue
            values = line.strip().split("\t")[1].split(",")
            sets.append(np.array([int(d) for d in values]))
            sys.stdout.write("\rRead {} sets".format(len(sets)))
        sys.stdout.write("\n")
    keys = list(range(len(sets)))

    # ``default=0`` keeps an empty sample from raising ValueError.
    max_size = max((len(s) for s in sets), default=0)

    # Generate paddings for asym: one padding MinHash per distinct pad size.
    paddings = {}
    if pad_for_asym:
        padding_sizes = sorted({max_size - len(s) for s in sets})
        for num_perm in num_perms:
            pads = {}
            pad = None
            prev_size = 0
            for padding_size in padding_sizes:
                # Sizes are ascending, so each padding extends a copy of the
                # previous (smaller) one instead of re-hashing from zero.
                if pad is None:
                    pad = MinHash(num_perm, hashfunc=_hash_32)
                else:
                    pad = pad.copy()
                for w in range(prev_size, padding_size):
                    pad.update(str(w) + "_tmZZRe8DE23s")
                pads[padding_size] = pad
                prev_size = padding_size
            paddings[num_perm] = pads

    # Generate minhash
    print("Creating MinHash...")
    minhashes = {}
    for num_perm in num_perms:
        print("Using num_perm = {}".format(num_perm))
        ms = []
        for s in sets:
            m = MinHash(num_perm, hashfunc=_hash_32)
            for word in s:
                m.update(str(word))
            if pad_for_asym:
                # Add padding to the minhash
                m.merge(paddings[num_perm][max_size - len(s)])
            ms.append(m)
            sys.stdout.write("\rMinhashed {} sets".format(len(ms)))
        sys.stdout.write("\n")
        minhashes[num_perm] = ms

    return (minhashes, sets, keys)
    def _construct_contigs_minhash(self):
        """Merge the MinHashes of every sequence line in the contigs file.

        Reads the file at ``self.__contigs_path`` line by line, skips lines
        starting with ``>`` (presumably FASTA header lines -- confirm against
        the input format), and merges ``self._construct_minhash(line)`` for
        each remaining stripped line into one 2048-permutation MinHash.
        Progress markers are printed at the start and end.

        Returns:
            The merged MinHash covering all non-header lines.
        """
        print('begin_construct_contigs_minhash')

        contigs_minhash = MinHash(num_perm=2048)

        # Stream the file instead of loading it whole; the context manager
        # closes the handle even if reading or merging raises.
        with open(self.__contigs_path, 'r') as contigs_file:
            for contig in contigs_file:
                if contig.startswith('>'):
                    continue
                contigs_minhash.merge(self._construct_minhash(contig.strip()))

        print('end_construct_contigs_minhash')
        return contigs_minhash