Exemplo n.º 1
0
def read_clusters(clusters_filename):
    """Rebuild a ``sekitei`` object and per-cluster feature lists from a dump.

    The file is scanned line by line; three kinds of lines are recognised:

    * ``---<cluster> = <count>`` -- a cluster header; the next ``<count>``
      lines are read as feature lines belonging to that cluster.
    * ``<feature> <index>`` -- a feature line (only while a header's count
      is still being consumed); the feature is added to ``tags`` and stored
      at ``tags_order[index]`` on the returned object.
    * ``n_features= <N>`` -- sets ``n_features`` and replaces ``tags_order``
      with ``N`` empty strings.

    Any other line is ignored.

    Returns a pair ``(mysekitei, regexpes)`` where ``regexpes`` is a list of
    ``[cluster, features, indices]`` entries, one per cluster whose feature
    lines were all read.
    """
    model = sekitei([], alpha=0.01)
    regexpes = []

    cluster_id = 0
    remaining = 0  # feature lines still expected for the current cluster
    features, positions = [], []

    with open(clusters_filename, 'r') as handle:
        for line in handle:
            # Cluster header: "---<cluster> = <count>".
            if line.startswith('---'):
                header = line[3:].split()
                cluster_id, remaining = int(header[0]), int(header[2])
                continue

            # Inside a cluster: every line must be "<feature> <index>".
            if remaining:
                feature, position = line.split()
                position = int(position)
                model.tags.add(feature)
                model.tags_order[position] = feature
                features.append(feature)
                positions.append(position)
                remaining -= 1

                if remaining == 0:
                    regexpes.append([cluster_id, features, positions])
                    features, positions = [], []
                continue

            # Outside a cluster only the "n_features= <N>" line matters.
            tokens = line.split()
            if tokens and tokens[0] == 'n_features=':
                model.n_features = int(tokens[1])
                # tags_order is recreated here, so this line has to precede
                # the feature lines for their indices to be kept.
                model.tags_order = [''] * model.n_features

    return model, regexpes
Exemplo n.º 2
0
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False):
    """ """
    random.shuffle(good_urls)
    random.shuffle(urls)

    fit_urls = good_urls[:n_urls] + urls[:n_urls]

    mysekitei = sekitei(fit_urls, alpha=0.01)
    mysekitei.fit()

    X = mysekitei.most_freq_features()
    
    if my_dbs: py = dbscan().fit_predict(X)
    else:      py = DBSCAN().fit_predict(X)

    hist = []
    clusters = list( set(py) )
    with open('data/clusters_features.txt', 'w') as file:
        
        print >>file,  mysekitei.n_features
        print >>file, '\n\n\n', '\n'.join(mysekitei.tags_order[:mysekitei.n_features]), '\n\n'

        for c in clusters:
            hist.append(len([p for p in py if p == c]))
            # print >>f, c, ':', hist[-1]

    vizualize_clusters(X, ([1] * n_urls + [0] * n_urls), py, hist)
    
    regexpes = mysekitei.get_clusters_regexpes(X, py)

    with open('data/clusters_freq_features.txt', 'w') as file:
        print  'n_features=', mysekitei.n_features, '\n\n'
        print >>file, 'n_features=', mysekitei.n_features, '\n\n'
        for c,f,i in regexpes:
            print '---', c, '=', str(len(f))
            print '\n'.join([fi + '\t\t\t ' + str(ii) for fi,ii in zip(f,i)]), '\n'
            print >>file, '---', c, '=', str(len(f))
            print >>file, '\n'.join([fi + '\t\t\t ' + str(ii) for fi,ii in zip(f,i)]), '\n'

    with open('data/united_regexpes.txt', 'w') as file:
        for k,f,i in regexpes:
            rex = '^'
            for r in f[:-1]:
                rex += '(?=%s)' % r.strip('^').rstrip('$')
            rex += '%s' % f[-1].strip('^')
            print >>file, k, '=', rex

    return mysekitei, regexpes
Exemplo n.º 3
0
def get_clusters(good_urls, urls, n_urls=500, my_dbs=False, verbose=False):
    """ """
    random.shuffle(good_urls)
    random.shuffle(urls)

    fit_urls = good_urls[:n_urls] + urls[:n_urls]

    mysekitei = sekitei(fit_urls, alpha=0.01)
    mysekitei.fit()

    X = mysekitei.most_freq_features()
    
    if my_dbs: py = dbscan().fit_predict(X)
    else:      py = DBSCAN().fit_predict(X)
        
    regexpes = mysekitei.get_clusters_regexpes(X, py)

    print  'n_features=', mysekitei.n_features, '\n\n'
    for c,f,i in regexpes:
        print '---', c, '=', str(len(f))
        print '\n'.join([fi + '\t\t\t ' + str(ii) for fi,ii in zip(f,i)]), '\n'

    return mysekitei, regexpes