Example #1
def hash_code(s, h):
    """Return the h-th sketch bucket id for each (stemmed) word in s."""
    output = []
    for w in s:
        h_v = hashing.hash_code(stemmer.stem(w))[h] % _SKETCH_BUCKET_SIZE
        output.append(h_v)

    return output
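
The helpers hashing.hash_code, stemmer.stem, and _SKETCH_BUCKET_SIZE above are project-level names that are not shown in this listing. A minimal, self-contained sketch of the same bucketing pattern, with hypothetical stand-ins for those names (the salted-md5 hash and the constants are assumptions, not the originals):

from __future__ import print_function
import hashlib

_SKETCH_BUCKET_SIZE = 30011  # hypothetical bucket count
HASH_NUMBER = 5              # hypothetical number of independent hashes

def hash_code_standin(word):
    # Derive HASH_NUMBER independent integer hashes by salting the digest.
    return [int(hashlib.md5(('%d:%s' % (i, word)).encode('utf-8')).hexdigest(), 16)
            for i in range(HASH_NUMBER)]

def bucket_ids(words, h):
    # Mirror of hash_code(s, h) above, minus the stemming step.
    return [hash_code_standin(w)[h] % _SKETCH_BUCKET_SIZE for w in words]

print(bucket_ids(['protest', 'rally'], 0))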
Example #2
def pairs(mat, words):
    """Scan all word pairs and print those whose bucket-pair entry in mat exceeds 0.005."""
    shape = mat.shape
    n = shape[1]

    for w1 in words:
        for w2 in words:

            hashcode = numpy.array(hashing.hash_code(w1)) % n
            h1 = hashcode[0]

            hashcode = numpy.array(hashing.hash_code(w2)) % n
            h2 = hashcode[0]

            if h1 > h2:
                continue

            v = mat[h1, h2]
            if v > 0.005:
                print(w1, w2, v)
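
A hypothetical usage sketch of the pair scan: mat is a sketch matrix whose upper triangle is probed at each word pair's bucket pair, and only entries above the 0.005 threshold are reported. The bucket function below is a deterministic stand-in for hashing.hash_code(w)[0] % n:

from __future__ import print_function
import numpy as np

n = 11

def first_bucket(w):
    return sum(ord(c) for c in w) % n  # stand-in hash, not the original

mat = np.zeros((n, n))
b1, b2 = sorted((first_bucket('rain'), first_bucket('flood')))
mat[b1, b2] = 0.01  # plant one strong co-occurrence for the demo

for w1, w2 in [('rain', 'flood'), ('rain', 'sun')]:
    h1, h2 = sorted((first_bucket(w1), first_bucket(w2)))
    if mat[h1, h2] > 0.005:
        print(w1, w2, mat[h1, h2])  # only the planted pair prints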
Example #3
def collision(mat, words):
    """Plot per-bucket collision counts for words and print which words share each bucket."""
    shape = mat.shape
    n = shape[1]

    counts = {}
    table = {}
    for i in xrange(n):  # bucket ids below are taken modulo n
        counts[i] = 0
        table[i] = []

    for w in words:
        hashcode = numpy.array(hashing.hash_code(w)) % n

        id = hashcode[0]
        counts[id] += 1
        table[id].append(w)

    plt.plot([counts[i] for i in xrange(n)])  # plot buckets in index order
    plt.show()
    print table
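
A self-contained version of the same collision diagnostic: hash a synthetic vocabulary into n buckets and plot per-bucket occupancy (the md5-based bucket function is a stand-in for the project's hashing module):

import hashlib
from collections import Counter
import matplotlib.pyplot as plt

n = 1000
words = ['word%d' % i for i in range(5000)]  # synthetic vocabulary

def bucket(w):
    return int(hashlib.md5(w.encode('utf-8')).hexdigest(), 16) % n

counts = Counter(bucket(w) for w in words)
plt.plot([counts.get(i, 0) for i in range(n)])  # occupancy per bucket
plt.show()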
Example #4
def simplified_ex(_fstr, _sketch_status=None, direct=False):
    """Infer topics from a sketch snapshot (read from _fstr, or passed via
    _sketch_status), merge them across hash functions with Apriori, and
    write the result to disk or return it."""
    if _fstr:
        _f = gzip.open(_fstr, 'rb')
        sketch_status = cpickle.load(_f)
        _f.close()
    else:
        sketch_status = _sketch_status

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])

    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print 'significant', _w
    #######################

    H = fast_hashing.HASH_NUMBER
    K = int(config.get('sketch', 'num_topics'))  # e.g. 15; int() avoids eval on config input

    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(H))

    if direct:
        return infer_results

    ### debugging
    print 'Inference finished.'
    ############

    transactions = []
    topics_group = []
    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        a_max = max(np.array(a).real)
        print a_max
        for k in xrange(K):
            s = set()
            topic = set()
            prob = v[:, k]

            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 0.1 * a_max:  #1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            _ranks = dict()
            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                _ranks[w] = p
                if p >= 0.0100:
                    s.add(w)
                if p >= 0.0075:
                    topic.add(w)

            _tops = sorted(_ranks.keys(),
                           key=lambda x: _ranks[x],
                           reverse=True)
            _top_n = 15
            if len(s) > _top_n:
                transactions.append(
                    apriori.Transaction(set(_tops[:_top_n]), h, k))
                #print _top_n
            else:
                transactions.append(apriori.Transaction(s, h, k))
                #print len(s)

            topics[k] = topic

            print h, k, a[k].real, map(lambda w, b: (w, b, _ranks[w]), s,
                                       hash_code(s, h))  # for debugging; b is the bucket id

        topics_group.append(topics)

    ### debugging
    print 'starting apriori.'
    #############

    output = apriori.apriori(transactions, 4)
    _result = dict()
    _result['time'] = _t
    _result['topics'] = list()

    print _t
    for ws in output:
        '''
        if support_distance(ws.support) > 5:
            continue'''

        eigen = np.array(map(lambda item: infer_results[item[0]][0][item[1]].real,
                             ws.support.iteritems()))
        merged = join(map(lambda item: topics_group[item[0]][item[1]],
                          ws.support.iteritems()))
        _result['topics'].append((connect_words(recover(ws.words, _words)),
                                  connect_words(recover(merged, _words)),
                                  np.max(eigen), np.median(eigen)))

    if _fstr:
        out_file = open('E:/experiment/results/' + _fstr.split('/')[-1], 'wb')
        cpk.dump(_result, out_file)
        out_file.close()
    else:
        return _result
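
remove_negative_terms and entropy are project helpers that simplified_ex calls but this listing does not define. A plausible minimal reading of the two filters, offered as an assumption rather than the original code (entropy is computed in nats here; the original's log base is not shown):

import numpy as np

def remove_negative_terms(prob):
    # Clip spectral residue below zero and renormalize to a distribution.
    p = np.clip(np.asarray(prob).real, 0.0, None)
    s = p.sum()
    return p / s if s > 0 else p

def entropy(prob):
    # Shannon entropy of a probability vector; a near-uniform (high-entropy)
    # bucket distribution indicates noise, which the caller discards.
    p = prob[prob > 0]
    return float(-(p * np.log(p)).sum())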
Example #5
def ex(_fstr):
    """Experiment driver: infer topics from the sketch snapshot in _fstr,
    filter them, and print Apriori-merged topic word sets with timings."""
    _f = gzip.open(_fstr, 'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])

    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print _w
    #######################

    H = 5
    K = 15

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)

    t = time.time()
    transactions = []
    topics_group = []
    for h in xrange(H):
        topics = dict()
        a, r, v = infer_results[h]
        for k in xrange(K):
            s = set()
            topic = set()
            prob = v[:, k]

            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p >= 0.0250:
                    s.add(w)
                if p >= 0.0150:
                    topic.add(w)

            transactions.append(apriori.Transaction(s, h, k))
            topics[k] = topic

            print h, k, a[k].real, map(lambda w, b: (w, b), s,
                                       hash_code(s, h))  # for debugging; b is the bucket id

        topics_group.append(topics)
    '''
    output = apriori.apriori(transactions, 3)
    for ws in output:
        print connect_words(recover(ws.words, _words)), np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems())))
    print '-------------------------------'
    '''

    output = apriori.apriori(transactions, 4)
    for ws in output:
        print '['
        print ws.support, support_distance(ws.support)
        print connect_words(recover(ws.words, _words)), np.max(
            np.array(
                map(lambda item: infer_results[item[0]][0][item[1]].real,
                    ws.support.iteritems())))
        print connect_words(recover(join(map(lambda item: topics_group[item[0]][item[1]], ws.support.iteritems())), _words)), \
            np.max(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems()))), \
            np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems())))
        print ']'
    print '-------------------------------'
    '''
    output = apriori.apriori(transactions, 5)
    for ws in output:
        print '['
        print connect_words(recover(ws.words, _words)), np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems())))
        print connect_words(recover(join(map(lambda item: topics_group[item[0]][item[1]], ws.support.iteritems())), _words)), \
            np.median(np.array(map(lambda item: infer_results[item[0]][0][item[1]].real, ws.support.iteritems())))
        print ']'

    print '-------------------------------'
    '''

    print 't1 = ' + str(time.time() - t)
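
The apriori module used above is project code. A minimal stand-in for the idea it implements here, finding word sets that recur across the per-(h, k) transactions, could look like this (names, data, and the pair restriction are illustrative):

from __future__ import print_function
from itertools import combinations
from collections import Counter

def frequent_pairs(transactions, min_support):
    # Count how many transactions contain each word pair.
    counts = Counter()
    for words in transactions:
        for pair in combinations(sorted(words), 2):
            counts[pair] += 1
    return {p: c for p, c in counts.items() if c >= min_support}

demo = [{'rain', 'flood', 'storm'}, {'rain', 'flood'},
        {'rain', 'sun'}, {'rain', 'flood', 'wind'}]
print(frequent_pairs(demo, 3))  # {('flood', 'rain'): 3}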
Example #6
def ex5():
    """Experiment: choose one candidate topic per hash function and print
    the intersection of their (looser) word sets."""
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140120_12_33_22',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    #######################
    mat = _m2[0]
    x = []  # for debugging
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])

    id = np.argmax(np.array(x))
    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[0] % _SKETCH_BUCKET_SIZE == id:
            print _w
    #######################

    H = 5
    K = 10

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)

    t = time.time()
    candidates = []
    more_candidates = []
    for h in xrange(H):
        a, r, v = infer_results[h]
        candidate = []
        more_candidate = []
        for k in xrange(K):
            s = set()
            more_s = set()

            prob = v[:, k]

            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p >= 0.025:
                    s.add(_w)
                if p >= 0.015:
                    more_s.add(_w)

            candidate.append(s)
            more_candidate.append(more_s)

        candidates.append(candidate)
        more_candidates.append(more_candidate)

    for h in xrange(H):
        print '------------------------------'
        for k in xrange(len(candidates[h])):
            print candidates[h][k]
        print '------------------------------'

    index = choose(candidates)

    for h in xrange(H):
        a, r, v = infer_results[h]
        plt.plot(v[:, h].real)
        plt.show()

    for h in xrange(H):
        print candidates[h][index[h]]

    topic_words = more_candidates[0][index[0]]

    for h in xrange(1, H):
        topic_words = topic_words.intersection(more_candidates[h][index[h]])

    output = ''
    for w in topic_words:
        output = output + w + ','

    print output

    print 't1 = ' + str(time.time() - t)
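
choose is project code that picks one candidate topic per hash function; the final topic is then the intersection of the looser (more_candidates) word sets across all hash functions. The effect in miniature, with toy data:

from __future__ import print_function

cands = [{'rain', 'flood', 'wind'},   # hash 0, chosen candidate
         {'rain', 'flood'},           # hash 1
         {'rain', 'flood', 'sun'}]    # hash 2

topic = cands[0]
for s in cands[1:]:
    topic = topic.intersection(s)

print(','.join(sorted(topic)))  # flood,rain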
Example #7
def ex4():
    """Experiment: take each hash function's last surviving topic and print
    the words supported by at least H - 1 of the H hash functions."""
    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140128_21_52_28',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]

    H = 5
    K = 50

    t = time.time()
    infer_results = map(
        lambda _h: solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, K),
        range(fast_hashing.HASH_NUMBER))
    print 't0 = ' + str(time.time() - t)

    t = time.time()
    candidates = []
    for h in xrange(H):
        a, r, v = infer_results[h]
        candidate = []
        for k in xrange(K):
            s = set()

            prob = v[:, k]

            prob = remove_negative_terms(prob)

            # filtering
            if a[k].real < 1.0:
                continue
            if entropy(prob) > 6.0:
                continue

            for _w in _words:
                w = stemmer.stem(_w)
                p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
                if p > 0.01:
                    s.add(_w)

            candidate.append(s)

        candidates.append(candidate)

    for h in xrange(H):
        print '------------------------------'
        for k in xrange(len(candidates[h])):
            print candidates[h][k]
        print '------------------------------'

    topic_words = candidates[0][-1]

    for h in xrange(1, H):
        topic_words = topic_words.union(candidates[h][-1])

    output = ''
    for w in topic_words:
        support = 0
        for h in xrange(H):
            if w in candidates[h][-1]:
                support += 1
        if support >= H - 1:
            output = output + w + ','

    print output

    print 't1 = ' + str(time.time() - t)
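
The voting loop at the end of ex4 keeps a word only if it appears in the last candidate set of at least H - 1 of the H hash functions. The same vote in compact form, with toy data:

from __future__ import print_function

H = 3
last = [{'rain', 'flood'}, {'rain', 'flood', 'sun'}, {'rain'}]

union = set().union(*last)
kept = sorted(w for w in union if sum(w in s for s in last) >= H - 1)
print(','.join(kept))  # flood,rain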
Example #8
def ex2():
    """Exploratory run on one hash function: inspect the sketch diagonal,
    solve for K topics, and print/plot each topic's word distribution."""

    _f = gzip.open(
        '/Users/weixie/Downloads/topicsketch_old/topicsketch_cut/20140120_12_33_22',
        'rb')
    sketch_status = cpickle.load(_f)
    _f.close()

    _t = datetime.datetime.utcfromtimestamp(sketch_status[0])
    _words = sketch_status[1]
    _m2 = sketch_status[2]
    _m3 = sketch_status[3]
    '''
    plt.matshow(numpy.absolute(m.toarray()[2400:2500, 2400:2500]), fignum=None, cmap=plt.cm.gray)
    plt.colorbar()
    plt.show()
    '''
    '''
    for h in xrange(5):
        a, r, v = solver.solve(_m2[h], _m3[h], _SKETCH_BUCKET_SIZE, 5)

        print sorted(a, key=lambda x: np.abs(x))



    #infer_results = map(lambda _h : solver.solve(_m2[_h], _m3[_h], _SKETCH_BUCKET_SIZE, 5), range(fast_hashing.HASH_NUMBER))
    '''
    h = 0
    K = 10

    mat = _m2[h]

    x = []
    for i in xrange(_SKETCH_BUCKET_SIZE):
        x.append(mat[i, i])

    plt.plot(x)
    plt.show()

    index = np.argmax(np.array(x))

    print index

    for _w in _words:
        w = stemmer.stem(_w)
        if hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE == index:
            print _w
    '''
    for y in sorted(x):
        print x.index(y), y
    '''

    a, r, v = solver.solve(_m2[h], _m3[h], _SKETCH_BUCKET_SIZE, K)
    print a
    print r

    print v[index, :]

    sorted_a = sorted(a, key=lambda x: x.real, reverse=True)  # a may be complex; order by real part

    #k = a.index(max(a, key=lambda x: x.real))

    for _k in xrange(K):
        k = a.index(sorted_a[_k])

        prob = v[:, k]

        prob = remove_negative_terms(prob)

        print k, sorted_a[_k]
        print 'entropy', k, entropy(prob)

        plt.plot(prob)
        plt.show()

        for _w in _words:
            w = stemmer.stem(_w)
            p = prob[hashing.hash_code(w)[h] % _SKETCH_BUCKET_SIZE]
            if p > 0.025:
                print _w, p

        print '########################################'
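
The diagnostic at the top of ex2 (and of the earlier examples) locates the "hottest" diagonal bucket of the second-moment sketch and lists the words hashing into it. Its core, isolated with stand-in data:

from __future__ import print_function
import numpy as np

n = 8
mat = np.zeros((n, n))
mat[3, 3] = 2.5  # pretend bucket 3 is accumulating mass

hot = int(np.argmax(np.diag(mat)))  # same as the x-list/argmax loop above
print('hot bucket:', hot)           # words with hash % n == hot are the suspects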