Example #1
    def __init__(
        self,
        fdir=None,
        name=None,
        source="body",
        redis_dir=None,
        basename=None,
        port=6379,
        host="localhost",
        num_perm=128,
        threshold=0.5,
        create=False,
        **kwargs,
    ):
        self.name = name or self.__class__.__name__
        self.indexed = False
        self.set_storage(fdir)

        self.source = source
        self.redis_dir = (redis_dir if redis_dir and Path(redis_dir).exists()
                          else None)
        self.basename = str(fdir or self) if (basename is None) else basename
        self.port = port
        self.host = host
        self.num_perm = num_perm
        self.perm = datasketch.MinHash(num_perm=self.num_perm).permutations
        self.threshold = threshold

        self.lshindex = None
        self.ci_tidi = {}
        self.digests: typing.Any = None
        self.digests_list: typing.List[typing.Any] = []
Example #2
    def index(self):

        self.digests = np.array(self.digests_list)
        del self.digests_list

        self.lshindex = self.make_lshindex()
        with self.lshindex.insertion_session() as session:
            ci_tidis = cluster.progress(self.ci_tidi.items(),
                                        f"Indexing {self.name}")
            for ci, (ti, di) in ci_tidis:
                mh = self.digests[di]
                name = f"{ti}-{ci}"
                m = datasketch.MinHash(num_perm=self.num_perm,
                                       permutations=self.perm,
                                       hashvalues=mh)
                session.insert(name, m, check_duplication=False)

        if self.redis_dir:
            r = self.cli("save")
            log.info(f"Saved redis with code {r.returncode}")

        self.indexed = True
        if self.storage:
            digestsdf = (pd.DataFrame(self.digests).reset_index().melt(
                id_vars=["index"], var_name="dim", value_name="val"))
            self.storage.save_df(digestsdf, "digests")
            self.storage.save_pickle(self.ci_tidi, "ci_tidi")
            self.storage.save_pickle(self.lshindex, "lshindex")
            self.close()
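
The index() above rebuilds each MinHash from a stored digest row plus the permutations shared across the class (set up in Example #1). A minimal standalone sketch of that round trip, assuming nothing beyond datasketch itself:

import datasketch

m = datasketch.MinHash(num_perm=128)
for token in ("alice", "bob", "carol"):
    m.update(token.encode("utf8"))

stored = m.digest()  # numpy array of the current minimum hash values

# rebuild an equivalent sketch from the stored values and the original permutations
rebuilt = datasketch.MinHash(num_perm=128,
                             permutations=m.permutations,
                             hashvalues=stored)
assert rebuilt.jaccard(m) == 1.0
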
Example #3
    def build_lsh(self, permutations=128):
        """Build the LSH object by injecting the minhashes into it.

        :param permutations: Number of permutations to use for minhashing.
        :type permutations: int
        :return: Returning self so that we can chain commands together.
        :rtype: LshDeduper
        """
        logger.info("Building minhashes...")
        for doc in common_tools.parse_data(self.data_dir):
            mh = datasketch.MinHash(num_perm=permutations)
            word_set = set([s.encode('utf-8') for s in doc["content"].split()])
            for word in word_set:
                mh.update(word)
            self.minhashes.append((doc["filename"], mh))

        if not self.lsh:
            raise Exception("Please first 'create_lsh' or 'load_lsh'")

        logger.debug("Inserting minhashes into lsh")
        with self.lsh.insertion_session() as session:
            for key, minhash in self.minhashes:
                session.insert(key, minhash)

        return self
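
build_lsh() assumes self.lsh already holds a datasketch.MinHashLSH created by the 'create_lsh' or 'load_lsh' it refers to. A minimal sketch of the insertion_session/query flow it relies on, with made-up document names:

import datasketch

docs = {
    "a.txt": "the quick brown fox jumps over the lazy dog",
    "b.txt": "the quick brown fox jumps over the lazy cat",
}

lsh = datasketch.MinHashLSH(threshold=0.5, num_perm=128)

minhashes = {}
for name, text in docs.items():
    mh = datasketch.MinHash(num_perm=128)
    for word in set(s.encode("utf-8") for s in text.split()):
        mh.update(word)
    minhashes[name] = mh

# bulk-insert keys, then query for candidates above the threshold
with lsh.insertion_session() as session:
    for name, mh in minhashes.items():
        session.insert(name, mh)

print(lsh.query(minhashes["a.txt"]))  # expected to contain both keys
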
Example #4
def get_minhash_signatures(sets, num_perm=128):
    assert isinstance(sets, np.ndarray) and len(sets.shape) == 3
    hashes = []

    for i in range(0, len(sets)):
        m = dk.MinHash(num_perm=num_perm)

        # the body is truncated in the source listing; the loop below follows
        # the split_into_shingles() usage seen in the later examples
        for row in sets[i]:
            for shingle in split_into_shingles(row, k=2):
                m.update(shingle)

        hashes.append(m.hashvalues)

    return hashes
Example #5
    def block(self, ti: int, cis: typing.Collection[int]):
        assert self.lshindex is not None
        for ci in cis:
            if ci in self.ci_tidi:
                ti, di = self.ci_tidi[ci]
                mh = self.digests[di]
                m = datasketch.MinHash(num_perm=self.num_perm,
                                       permutations=self.perm,
                                       hashvalues=mh)
                for name in self.lshindex.query(m):
                    ti, _ = map(int, name.split("-", 1))
                    yield ti
Example #6
def minhash_data(data: List[Any]) -> datasketch.LeanMinHash:
    minhash = datasketch.MinHash(num_perm=256)

    for element in data:
        try:
            minhash.update(element.encode("utf-8"))
        except AttributeError as e:
            logger.warning(e)
            continue

    return datasketch.LeanMinHash(seed=minhash.seed,
                                  hashvalues=minhash.hashvalues)
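
LeanMinHash is the immutable, memory-light form used once the sketch no longer needs updating; it also serializes compactly, which the disassembly-hashing examples further down rely on. A small sketch of the byte-buffer round trip, using only the datasketch API:

import datasketch

m = datasketch.MinHash(num_perm=256)
for token in ("alpha", "beta", "gamma"):
    m.update(token.encode("utf-8"))

lean = datasketch.LeanMinHash(m)

# serialize into a pre-sized buffer and restore an equivalent sketch
buf = bytearray(lean.bytesize())
lean.serialize(buf)
restored = datasketch.LeanMinHash.deserialize(buf)

assert restored.jaccard(lean) == 1.0
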
Example #7
    def __init__(self, process='m'):
        self.songname = ''
        self.fingerprint = datasketch.MinHash(num_perm=256)
        self.framerate = []
        if process == 'a':
            self.ask_user()
        elif process == 'm':
            pass
        else:
            prompt = ('Enter "a" for automated fingerprinting '
                      'or "m" to proceed manually: ')
            if input(prompt) == 'a':
                self.ask_user()
            else:
                sys.exit('Error: Incorrect entry.')
Example #8
File: main.py Project: jgrimm3/Recommenter
async def debugvid():
    recommenter = await Recommenter.create()  # type: Recommenter
    videos = recommenter.readFromSQL("videoInfo3.db")[:1000]
    for video in videos:
        if video.id in ["5DGwOJXSxqg", "mWXurqWRA74"]:
            print(f"Generating comment minhash for {video.id}")
            shingles = w_shingle(video.comment_content, SHINGLE_SIZE_COMMENTS)
            minhash = datasketch.MinHash(num_perm=800)
            rated_shingles = await recommenter.word_frequency.sort_shingles(shingles)
            print(f"Top 4 comment shingles for {video.id} are {rated_shingles[:4]}")
            for shingle in rated_shingles[:800]:
                minhash.update(shingle.encode('utf8'))
            minhash_id = f"{video.id}-comment"
            print(f"Storing minhash {minhash_id}")
            await recommenter.store_minhash(minhash_id, minhash)
            print(f"Inserting minhash {minhash_id}")
            await recommenter.comments.insert_minhash_obj(minhash_id, minhash)
Example #9
def tree_sim(deprels):
    cfg = {
        'use_trunc_leaves': True,
        'use_drop_nodes': False,
        'use_replace_attr': False
    }
    mhash = []
    for deprel in deprels:
        adjac = [(index + 1, head, dep)
                 for index, (head, dep) in enumerate(deprel)]
        nested = ts.adjac_to_nested_with_attr(adjac)
        nested = ts.remove_node_ids(nested)
        shingled = ts.shingleset(nested, **cfg)
        stringified = [json.dumps(tree).encode('utf-8') for tree in shingled]
        m = datasketch.MinHash(num_perm=256)
        for s in stringified:
            m.update(s)
        mhash.append(m)
    return mhash[0].jaccard(mhash[1])
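
tree_sim() returns MinHash.jaccard(), which is only an estimate of the true Jaccard similarity of the two shingle sets; the standard error is roughly 1/sqrt(num_perm), so a few percentage points at num_perm=256. A tiny check of that behaviour on plain word sets:

import datasketch

a = {"the", "quick", "brown", "fox", "jumps"}
b = {"the", "quick", "brown", "fox", "sleeps"}

ma = datasketch.MinHash(num_perm=256)
mb = datasketch.MinHash(num_perm=256)
for w in a:
    ma.update(w.encode("utf-8"))
for w in b:
    mb.update(w.encode("utf-8"))

exact = len(a & b) / len(a | b)   # 4 / 6 ≈ 0.67
print(ma.jaccard(mb), exact)      # the estimate should land near the exact value
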
Example #10
    def add(self, table):

        rows = []
        if self.source != "head":
            rows += list(
                tuple([cell.get("text", "").lower() for cell in r])
                for r in table["tableData"])
        if self.source != "body":
            rows += list(
                tuple([cell.get("text", "").lower() for cell in r])
                for r in table["tableHeaders"])
        cols = list(zip(*rows))

        if not table.get("numericColumns", []):

            def isnum(col):
                num = lambda x: x.translate(str.maketrans("", "", "-.,%")
                                            ).isnumeric()
                return sum(int(num(c)) for c in col) / len(col) > 0.5

            table["numericColumns"] = [
                i for i, c in enumerate(zip(*rows)) if isnum(c)
            ]

        ci_range = range(table["columnIndexOffset"],
                         table["columnIndexOffset"] + table["numCols"])
        ti = table["tableIndex"]
        for col, (ci, cells) in enumerate(zip(ci_range, cols)):
            if col not in table.get("numericColumns", []):
                cells = set(c for c in cells if c)
                if len(cells) > 0:
                    m = datasketch.MinHash(num_perm=self.num_perm,
                                           permutations=self.perm)
                    for c in cells:
                        m.update(c.encode("utf8"))
                    self.ci_tidi[ci] = (ti, len(self.digests_list))
                    self.digests_list.append(m.digest())
Example #11
def minhash_dedupe(data_dir="news_data", threshold=0.75, permutations=128):
    """Deduplicate by creating the minhash approximation of a jaccard score.

    3rd party libraries:
        https://ekzhu.github.io/datasketch/minhash.html

    :param data_dir: Location of all documents.
    :type data_dir: str
    :param threshold: Threshold above which we consider two documents duplicates
    :type threshold: float
    :param permutations: Number of permutations to use for the minhash
    :type permutations: int
    :return: Similarity scores of the document pairs judged to be duplicates
    :rtype: list of floats
    """
    minhashes = []
    for doc in common_tools.parse_data(data_dir):
        mh = datasketch.MinHash(num_perm=permutations)
        words = [s.encode('utf-8') for s in doc["content"].split()]
        for word in words:
            mh.update(word)
        minhashes.append(mh)

    duplicates = []
    for i_doc in range(len(minhashes)):
        for j_doc in range(i_doc + 1, len(minhashes)):
            minhash_similarity = minhashes[i_doc].jaccard(minhashes[j_doc])

            is_duplicate = minhash_similarity >= threshold
            if is_duplicate:
                duplicates.append(minhash_similarity)

    # TODO cluster duplicates
    logger.info("Number of minhash duplicates with threshold {} = {}".format(threshold, len(duplicates)))

    return duplicates
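
The pairwise loop above is O(n²) in the number of documents. A hedged sketch of the same threshold test done with datasketch.MinHashLSH, which only compares candidate pairs (the function name and surrounding helper are illustrative, not part of the original project):

import datasketch

def minhash_dedupe_lsh(minhashes, threshold=0.75, permutations=128):
    """Return candidate duplicate index pairs via LSH instead of all-pairs comparison."""
    lsh = datasketch.MinHashLSH(threshold=threshold, num_perm=permutations)
    for i, mh in enumerate(minhashes):
        lsh.insert(str(i), mh)

    pairs = set()
    for i, mh in enumerate(minhashes):
        for key in lsh.query(mh):
            j = int(key)
            if j != i:
                pairs.add(tuple(sorted((i, j))))
    return pairs
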
Example #12
    def cluster_by_similarites(self, threshold=0.7, num_perm=128):
        from scipy.cluster.hierarchy import linkage, cophenet, fcluster

        n_pieces = len(self.sets)
        n_rows = self.sets.shape[1] if isinstance(self.sets, np.ndarray) else len(self.sets[0])

        minhashes = []

        # pieces
        shingles_idx = 0
        for p in range(0, n_pieces):

            piece = self.sets[p]

            minhash = dk.MinHash(num_perm=num_perm)

            n_cols = len(piece[0])
            n_sequence = n_cols + self.k - 1
            n_shingle_elements = n_sequence - self.k + 1

            shingles = np.empty(n_rows * n_shingle_elements, dtype="S" + str(self.k))

            # iterating sequences from a region
            for s in range(0, n_rows):

                # input sequence considering surplus characters
                sequence = np.empty((n_sequence,), dtype="S1")
                sequence[0 : n_cols] = piece[s]

                if p != n_pieces - 1:  # if we aren't on the last piece:

                    next_piece = self.sets[p + 1]
                    sequence[n_cols:] = next_piece[s][0: self.k - 1]  # surplus
                else:

                    sequence[n_cols:] = 'Z'  # TODO: possibly replace with the most likely value

                shingled_sequence = self.__split_into_shingles__(sequence)
                assert len(shingled_sequence) == n_shingle_elements, \
                    'Shingled sequence: ' + str(len(shingled_sequence)) + ' and fixed len ' + str(n_shingle_elements)

                print(shingles_idx + n_cols)

                shingles[shingles_idx: shingles_idx + n_shingle_elements] = shingled_sequence
                shingles_idx += n_shingle_elements

            for word in shingles:
                minhash.update(word)

            minhashes.append(minhash)
            shingles_idx = 0

        assert len(minhashes) == n_pieces
        print(shingles)

        distance_matrix = np.empty((n_pieces, n_pieces), dtype=float)

        for i in range(0, len(minhashes)):
            for j in range(0, len(minhashes)):

                if i == j:
                    distance_matrix[i][j] = 0
                else:

                    similarity = minhashes[i].jaccard(minhashes[j])

                    if similarity == 0:
                        distance_matrix[i][j] = 1
                    else:
                        distance_matrix[i][j] = 1 / similarity

        Z = linkage(distance_matrix)  # todo: test different metrics

        from scipy.cluster.hierarchy import dendrogram

        dendrogram(Z, show_leaf_counts=True)

        # import matplotlib.pyplot as plt
        # plt.show()
        # plt.savefig('dendrogram_' + str(self.k))

        return fcluster(Z, 0.70)
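
One caveat, assuming a current SciPy: linkage() treats a 2-D input as a matrix of observation vectors, not as a precomputed distance matrix, so passing the square distance_matrix directly clusters the rows of that matrix rather than the distances it encodes. A sketch of feeding the same distances in the condensed form linkage expects:

import numpy as np
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform

# tiny symmetric distance matrix with a zero diagonal, standing in for the
# 1/similarity matrix built in the example above
distance_matrix = np.array([[0.0, 1.2, 4.0],
                            [1.2, 0.0, 3.5],
                            [4.0, 3.5, 0.0]])

condensed = squareform(distance_matrix, checks=False)  # what linkage() expects
Z = linkage(condensed, method="average")
labels = fcluster(Z, t=2.0, criterion="distance")
print(labels)
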
Example #13
    def handleFlow(self):

        #TODO replace sorting loops with sorted function
        self.targets = {}
        self.api = []
        #flow = []

        addrs = []
        internals = []

        for instr in self.bb_insns:
            if isinstance(instr, CallInsn):
                if instr.is_api:
                    self.targets[instr.addr] = "API:" + instr.fcn_name

                    self.api.append({"name": instr.fcn_name})
                else:
                    internals.append(instr.addr)

            else:
                if instr.jumpout:
                    internals.append(instr.addr)
                else:
                    addrs.append(instr.addr)
                    addrs.append(instr.offset)

        addrs.sort()
        addrs_dict = {}
        for i in range(len(addrs)):
            addrs_dict[addrs[i]] = i

        internals_sorted = internals[:]
        internals_sorted.sort()
        calleds_dict = {}
        for i in range(len(internals_sorted)):
            calleds_dict[internals_sorted[i]] = str(i)

        flowhash = datasketch.MinHash(num_perm=32)

        for instr in self.bb_insns:
            if isinstance(instr, CallInsn):
                if instr.is_api:
                    #flow.append(hex(instr.offset)+"  API:" + instr.fcn_name)
                    flowhash.update("API:" + instr.fcn_name)
                else:
                    #flow.append(hex(instr.offset)+"  OUT:" + calleds_dict[instr.addr])
                    flowhash.update("OUT:" + calleds_dict[instr.addr])
                    self.targets[
                        instr.addr] = "OUT:" + calleds_dict[instr.addr]
            else:
                if instr.jumpout:
                    #flow.append(hex(instr.offset)+"  OUT:" + calleds_dict[instr.addr])
                    flowhash.update("OUT:" + calleds_dict[instr.addr])
                    self.targets[
                        instr.addr] = "OUT:" + calleds_dict[instr.addr]
                else:
                    off = addrs_dict[instr.offset]
                    tgt = addrs_dict[instr.addr]
                    #flow.append("%x (%d)   JMP:%s   - %x (%d)" % (instr.offset, off, str(tgt - off), instr.addr, tgt))
                    flowhash.update("JMP:" + str(tgt - off))
                    self.targets[instr.addr] = "JMP:" + str(tgt - off)

        lean_flowhash = datasketch.LeanMinHash(flowhash)
        flowhash_buf = bytearray(lean_flowhash.bytesize())
        lean_flowhash.serialize(flowhash_buf)

        self.flowhash = str(flowhash_buf)
Example #14
    def handleInsns(self):
        consts = {}
        ips = []

        #set default values for PC, SP, BP
        pc_offset = self.arch.ip_offset
        regs = {pc_offset: 0, self.arch.sp_offset: 1, self.arch.bp_offset: 2}
        consts = {}
        irsbs = []

        for instr_c in range(len(self.insns_list)):
            off = self.insns_list[instr_c][0]
            instr = self.insns_list[instr_c][1]

            #handle instructions not recognized by libVEX
            if self.arch.name == "X86" or self.arch.name == "AMD64":
                if instr == "\xf4":  #hlt x86 instruction
                    irsbs.append("HALT")
                    continue
                elif instr.startswith("\xf0"):  #lock x86 prefix
                    irsbs.append("LOCK")
                    if len(instr) == 1:
                        continue
                    instr = instr[1:]
            try:
                irsb = pyvex.IRSB(instr, off, self.arch, opt_level=0)
            except pyvex.errors.PyVEXError as err:
                print(
                    "[Please report to the developer] Error with instruction "
                    + instr.encode("hex"))
                raise err
            irsbs.append(irsb)

            stmts = irsb.statements
            n_addr = 0

            for i in range(len(stmts)):
                #TODO PutI GetI
                if isinstance(stmts[i], pyvex.stmt.IMark):
                    n_addr = stmts[i].addr + stmts[i].len
                elif isinstance(stmts[i], pyvex.stmt.Put):

                    if stmts[i].offset == pc_offset and len(
                            stmts[i].constants) == 1:
                        c = stmts[i].constants[0]
                        if c.value in self.targets:
                            stmts[i].data = StrConst(self.targets[c.value])
                            stmts[i].offset = 0
                            continue
                        elif c.value == n_addr:
                            stmts[i].data = StrConst("_NEXT_")
                            stmts[i].offset = 0
                            continue
                        else:
                            ips.append(c.value)
                            stmts[i].reg_name = 0xABADCAFE
                            stmts[i].offset = 0
                    else:
                        # constants replace
                        for j in range(len(stmts[i].constants)):
                            c = stmts[i].constants[j]
                            if c.value in self.targets:
                                stmts[i].constants[j] = StrConst(
                                    self.targets[c.value])
                            elif c.value == n_addr:
                                stmts[i].constants[j] = StrConst("_NEXT_")
                            else:
                                # constants abstraction
                                consts[c.value] = consts.get(
                                    c.value, len(consts))
                                c.value = consts[c.value]

                        # registers abstraction
                        regs[stmts[i].offset] = regs.get(
                            stmts[i].offset, len(regs))
                        stmts[i].offset = regs[stmts[i].offset]
                elif isinstance(stmts[i], pyvex.stmt.Exit):
                    c = stmts[i].dst
                    if c.value in self.targets:
                        stmts[i] = "if (%s) { PUT(offset=0) = %s; %s }" % (
                            stmts[i].guard, self.targets[c.value],
                            stmts[i].jumpkind)
                        continue
                    else:
                        ips.append(c.value)
                        stmts[i].reg_name = 0xDEADBEEF
                else:
                    # constants replace
                    for j in range(len(stmts[i].constants)):
                        c = stmts[i].constants[j]
                        if c.value in self.targets:
                            stmts[i].constants[j] = StrConst(
                                self.targets[c.value])
                        elif c.value == n_addr:
                            stmts[i].constants[j] = StrConst("_NEXT_")
                        else:
                            # constants abstraction
                            consts[c.value] = consts.get(c.value, len(consts))
                            c.value = consts[c.value]

                for expr in stmts[i].expressions:
                    if isinstance(expr, pyvex.expr.Get):
                        # registers abstraction
                        regs[expr.offset] = regs.get(expr.offset, len(regs))
                        expr.offset = regs[expr.offset]

        #order addresses
        addrs = {}
        ips.sort()
        for i in range(len(ips)):
            addrs[ips[i]] = i

        #self.vex_code = ""
        #self.shingled_code = ""

        vexhash = datasketch.MinHash(num_perm=64)
        shingled = {}
        last = ""

        for c in range(len(irsbs)):
            irsb = irsbs[c]

            if type(irsb) == type(""):
                ngram = last + irsb
                #self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, irsb)
                shingled[ngram] = shingled.get(ngram, 0) + 1
                last = irsb
                continue

            stmts = irsb.statements
            ins = ""

            for i in range(len(stmts)):
                if isinstance(stmts[i], pyvex.stmt.IMark) or isinstance(
                        stmts[i], pyvex.stmt.AbiHint):
                    continue

                if hasattr(stmts[i], "reg_name"):
                    if stmts[i].reg_name == 0xABADCAFE:
                        stmts[i].constants[0].value = addrs[
                            stmts[i].constants[0].value]
                    elif stmts[i].reg_name == 0xDEADBEEF:
                        stmts[i].dst.value = addrs[stmts[i].dst.value]

                v = str(stmts[i]) + "\n"
                ins += v
                ngram = last + v
                shingled[ngram] = shingled.get(ngram, 0) + 1
                last = v

            #self.vex_code += "+++ Instr #%d +++\n%s\n" % (c, ins)

        for ngram in shingled:
            for c in range(shingled[ngram]):
                vexhash.update("[%d]\n%s" % (c, ngram))
                #self.shingled_code += "[%d]\n%s" % (c, ngram)

        lean_vexhash = datasketch.LeanMinHash(vexhash)
        vexhash_buf = bytearray(lean_vexhash.bytesize())
        lean_vexhash.serialize(vexhash_buf)

        self.vexhash = str(vexhash_buf)
Example #15
File: main.py Project: jgrimm3/Recommenter
async def populateDatabase():
    recommenter = await Recommenter.create()  # type: Recommenter
    videos = recommenter.readFromSQL("videoInfo4.db")

    """
    i = 0
    for video in videos:
        # first we add the video's words to our word frequency index
        if video.id in recommenter.word_frequency.unique_doc['doc_ids']:
            print(f"Not adding {video.id}'s words to word frequency index, already indexed")
            continue
        else:
            print(f"Adding {video.id}'s words to word frequency index")
            for word in tqdm(' '.join([video.comment_content, video.transcript_content]).split(' ')):
                await recommenter.word_frequency.add_to_index(word, video.id)
            await recommenter.word_frequency.add_doc_id(video.id)
            if i == 10:
                await recommenter.word_frequency.upload_cache()
                i = -1
            i += 1
    
    """
    for video in videos:
        if video.id in ["5DGwOJXSxqg", "mWXurqWRA74", "jkGtMjkkmn4", "cqkiim_K0sc", "Ft00DUHRCOo", "FtX_oGO9MHo"]:
            print(f"Skipping {video.id} as its on the blacklist")
            continue
        """
        if video.has_enough_comments():
            minhash_id = f"{video.id}-comment"
            if (await recommenter.retrieve_minhash(minhash_id)) != None:
                print(f"Skipping video {video.id} because comment minhash stored, presumed already indexed")
                continue # skip video
            print(f"Generating comment minhash for {video.id}")
            shingles = w_shingle(video.comment_content, SHINGLE_SIZE_COMMENTS)
            minhash = datasketch.MinHash(num_perm=800)
            rated_shingles = await recommenter.word_frequency.sort_shingles(shingles)
            print(f"Top 4 comment shingles for {video.id} are {rated_shingles[:4]}")
            for shingle in rated_shingles[:800]:
                minhash.update(shingle.encode('utf8'))
            print(f"Storing minhash {minhash_id}")
            await recommenter.store_minhash(minhash_id, minhash)
            #print(f"Inserting minhash {minhash_id}")
            #await recommenter.comments.insert_minhash_obj(minhash_id, minhash)
        """
        if video.has_enough_transcripts():
            minhash_id = f"{video.id}-transcript-w{SHINGLE_SIZE_TRANSCRIPT}"
            if (await recommenter.retrieve_minhash(minhash_id)) != None:
                print(f"Skipping video {video.id} because transcript minhash stored, presumed already indexed.")
                continue # skip video
            print(f"Generating transcript minhash for {video.id}")
            shingles = w_shingle(video.transcript_content, SHINGLE_SIZE_TRANSCRIPT)
            minhash = datasketch.MinHash(num_perm=800)
            rated_shingles = await recommenter.word_frequency.sort_shingles(shingles)
            print(f"Top 4 transcript shingles for {video.id} are {rated_shingles[:4]}")
            for shingle in rated_shingles[:800]:
                minhash.update(shingle.encode('utf8'))
            print(f"Storing minhash {minhash_id}")
            await recommenter.store_minhash(minhash_id, minhash)
            #print(f"Inserting minhash {minhash_id}")
            #await recommenter.transcripts.insert_minhash_obj(minhash_id, minhash)

    await recommenter.close()
Example #16
def calc_jaccard_similarities(sets, k=2, inter_alignments=False):
    assert len(set(len(subset) for subset in sets)) == 1
    assert isinstance(sets, np.ndarray)

    # print sets.shape, len(sets[0])
    # print len(sets[0]) - k + 1
    # print len(sets)

    assert isinstance(inter_alignments, bool)

    minhashes = []

    if inter_alignments:

        assert len(sets.shape) == 3

        shingles = np.empty(
            (sets.shape[0], sets.shape[1] * (sets.shape[2] - k + 1), k),
            dtype="S2")

        shingle_idx = 0
        set_row_len = sets.shape[2] - k + 1

        for i in range(0, sets.shape[0]):

            m = dk.MinHash()
            for j in range(0, sets.shape[1]):
                # print 'N shingles', len(sets[i][j]) - k + 1

                shingles[i, shingle_idx:shingle_idx +
                         set_row_len] = split_into_shingles(sets[i][j], k=k)
                shingle_idx += set_row_len

            shingle_idx = 0

            shingle_str = [''.join(s) for s in shingles[i].astype(str)]
            for s in shingle_str:
                m.update(s.encode('utf-8'))

            minhashes.append(m)

    # if not inter_alignments:
    else:
        shingles = np.zeros((len(sets), len(sets[0]) - k + 1, k), dtype="S2")

        for i in range(0, len(sets)):

            shingles[i] = split_into_shingles(sets[i], k=k)
            m = dk.MinHash()

            # for s in shingles[i]:
            shingle_str = [''.join(s) for s in shingles[i].astype(str)]
            for s in shingle_str:
                m.update(s.encode('utf-8'))
            minhashes.append(m)

    assert len(sets) == len(minhashes)

    if not inter_alignments:
        n_rows = len(sets) * (len(sets) - 1)
    else:

        n_rows = sets.shape[0] - 1

    # df = pd.DataFrame(data=np.zeros(permutations, 3), index='index', columns=['seq i', 'seq j', 'jaccard'], dtype=np.float)
    # jaccard_dict = {'jaccard' : np.zeros(permutations, dtype=np.float), 'seq i': np.zeros(permutations, dtype=np.int), 'seq j' : np.zeros(permutations, dtype=np.int)}

    jaccard_df = pd.DataFrame(
        np.empty((n_rows, ),
                 dtype=[('i', np.uint8), ('j', np.uint8),
                        ('jaccard', float)]))

    row = 0
    for i in range(0, len(sets)):
        for j in range(0, len(sets)):

            if i != j and not (
                (jaccard_df['i'] == 2) &
                (jaccard_df['j'] == 5)).any():  # excluding intersections

                str1 = [''.join(s) for s in shingles[i].astype(str)]
                str2 = [''.join(s) for s in shingles[j].astype(str)]

                jaccard = minhashes[i].jaccard(minhashes[j])

                # print i, j, float(len(set(str2) & set(str1))) / len(set(str2) | set(str1))

                jaccard_df['i'][row] = i
                jaccard_df['j'][row] = j
                jaccard_df['jaccard'][row] = jaccard

                row += 1

    # df = pd.DataFrame(data=jaccard_dict)
    return jaccard_df
Example #17
def cluster_by_lsh(sets, k=2, num_perm=128):

    # list of 2d ndarrays or 3d ndarray
    assert (isinstance(sets, np.ndarray) and len(sets.shape) == 3) or (
        isinstance(sets, list)
        and all(isinstance(x, np.ndarray) and len(x.shape) == 2
                for x in sets))  # 3d ndarray
    assert isinstance(k, int) and k > 0

    n_pieces = len(sets)
    n_rows = sets.shape[1]
    n_cols = sets.shape[2]

    n_sequence = n_cols + k - 1
    n_shingle_elements = n_sequence - k + 1

    # shingles = np.empty((n_pieces, n_rows * (n_shingle_elements)), dtype="S" + str(k))

    minhashes = []

    # pieces
    shingles_idx = 0
    for p in range(0, n_pieces):

        piece = sets[p]

        minhash = dk.MinHash(num_perm=num_perm)
        shingles = np.empty(n_rows * n_shingle_elements, dtype="S" + str(k))

        # iterating sequences from a region
        for s in range(0, len(piece)):

            # input sequence considering surplus characters
            sequence = np.empty((n_sequence, ), dtype="S1")
            sequence[0:n_cols] = piece[s]

            if p != n_pieces - 1:  # if we aren't on the last piece:

                next_piece = sets[p + 1]
                sequence[n_cols:] = next_piece[s][0:k - 1]  # surplus
            else:

                sequence[n_cols:] = 'Z'  # TODO: possibly replace with the most likely value

            shingled_sequence = split_into_shingles(sequence, k=k)
            assert len(shingled_sequence) == n_shingle_elements, \
                'Shingled sequence: ' + str(len(shingled_sequence)) + ' and fixed len ' + str(n_shingle_elements)

            # print 'Seq len', len(sequence)
            # print 'Len', len(shingled_sequence), 'Seq', shingled_sequence
            # print 'Shingle len', len(shingles[p][shingles_idx : shingles_idx + n_cols + 1])
            # shingles[p][:, shingles_idx: shingles_idx + len(sequence) - 1] = shingled_sequence
            print(shingles_idx + n_cols)

            # shingles[p][shingles_idx : shingles_idx + n_shingle_elements] = shingled_sequence
            shingles[shingles_idx:shingles_idx +
                     n_shingle_elements] = shingled_sequence
            shingles_idx += n_shingle_elements

        for word in shingles:
            minhash.update(word)

        minhashes.append(minhash)
        shingles_idx = 0

        # shingle_str = [''.join(s) for s in shingles[piece].astype(str)]
        #for s in shingles[piece]:
        #    minhash.update(s.encode('utf-8'))

    assert len(minhashes) == n_pieces
    print(shingles)

    distance_matrix = np.empty((n_pieces, n_pieces), dtype=float)

    for i in range(0, len(minhashes)):
        for j in range(0, len(minhashes)):

            if i == j:
                distance_matrix[i][j] = 0
            else:

                similarity = minhashes[i].jaccard(minhashes[j])

                if similarity == 0:
                    distance_matrix[i][j] = 1
                else:
                    distance_matrix[i][j] = 1 / similarity

    print(distance_matrix)

    Z = linkage(distance_matrix)  # TODO: see which metrics and methods are appropriate

    from scipy.cluster.hierarchy import dendrogram

    dendrogram(Z, show_leaf_counts=True)

    import matplotlib.pyplot as plt
    plt.show()

    return fcluster(Z, 0.70)