/
purity.py
92 lines (69 loc) · 2.73 KB
/
purity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import f1_score
from dbscan import dbscan
from sklearn.cluster.dbscan_ import DBSCAN
from collections import defaultdict
import numpy as np
import math
import json
import sys
import re
import random
from sekitei import sekitei
def purity(mysekitei, regexpes, good_urls, urls, n_urls=500,
           n_bootstreping=25, verbose=False):
    """Estimate the purity of a site's clustering by repeated random sampling.

    Each round shuffles both URL populations, takes the slice
    ``[n_urls:2 * n_urls]`` of each as a held-out sample, assigns the sample
    to clusters through ``mysekitei`` and scores the assignment: for every
    cluster listed in ``regexpes`` the size of its majority class is summed
    and the sum is divided by the size of the held-out sample.  The result
    is the mean of these scores over all rounds.

    Args:
        mysekitei: object providing ``matrix_of_existing_features(urls)`` and
            ``distribute_among_clusters(matrix, regexpes)``; the latter must
            yield one cluster label per held-out URL, in input order.
        regexpes: cluster descriptions whose first element is the cluster
            label; forwarded unchanged to ``distribute_among_clusters``.
        good_urls: URLs of the first class.  Not modified.
        urls: URLs of the second class.  Not modified.
        n_urls: number of URLs per class skipped at the head of each shuffled
            list, and the maximum number per class taken for the held-out
            sample.
        n_bootstreping: number of sampling rounds to average over.
        verbose: when true, print the cluster count and the running sum of
            the per-round scores.

    Returns:
        float: the purity averaged over ``n_bootstreping`` rounds.

    Raises:
        ValueError: if neither list has more than ``n_urls`` entries, so the
            held-out sample would be empty.
        ZeroDivisionError: if ``n_bootstreping`` is 0.
    """
    cluster_ids = [entry[0] for entry in regexpes]
    if verbose:
        print('clusters= %d\n' % len(cluster_ids))

    # Shuffle private copies so the caller's lists keep their order.  The
    # copies are reshuffled in place every round, which consumes the random
    # stream exactly as shuffling the originals would.
    good_pool = list(good_urls)
    other_pool = list(urls)

    estimation = 0.
    for step in range(n_bootstreping):
        random.shuffle(good_pool)
        random.shuffle(other_pool)

        good_sample = good_pool[n_urls:2 * n_urls]
        other_sample = other_pool[n_urls:2 * n_urls]
        held_out = good_sample + other_sample
        if not held_out:
            raise ValueError(
                'held-out sample is empty: need more than n_urls=%d urls'
                % n_urls)
        # The class boundary is the real size of the first slice; it is
        # smaller than n_urls when good_urls has fewer than 2 * n_urls items.
        n_good = len(good_sample)

        P = mysekitei.matrix_of_existing_features(held_out)
        distrib = mysekitei.distribute_among_clusters(P, regexpes)

        # One pass over the assignments, counting per label and per class.
        # Assumes labels are hashable (presumably ints) -- TODO confirm
        # against distribute_among_clusters.
        good_counts, other_counts = {}, {}
        for position, label in enumerate(distrib):
            counts = good_counts if position < n_good else other_counts
            counts[label] = counts.get(label, 0) + 1

        # Only labels listed in regexpes contribute to the numerator; any
        # other label still counts towards the denominator.
        majority = sum(max(good_counts.get(cid, 0), other_counts.get(cid, 0))
                       for cid in cluster_ids)
        # float() keeps true division under Python 2 as well.
        estimation += majority / float(len(held_out))

        if verbose:
            # This is the running sum, not the score of this round alone.
            print('%d estimation= %f' % (step, estimation))

    return estimation / n_bootstreping
def read_clusters(clusters_filename):
    """Load a cluster description file into a ``sekitei`` object.

    The file is read line by line and three kinds of lines are recognised:

    * ``n_features= <N>`` -- sets ``mysekitei.n_features`` and resets
      ``mysekitei.tags_order`` to ``N`` empty strings.  It is only recognised
      while no cluster entries are pending.
    * a header starting with ``---`` -- the first and third whitespace
      separated tokens after the marker are the cluster label and the number
      of feature lines that follow.
    * ``<feature> <index>`` -- one feature of the current cluster; consumed
      only while feature lines are still expected.

    Any other line is ignored.

    Args:
        clusters_filename: path of the file to read.

    Returns:
        tuple: ``(mysekitei, regexpes)`` where ``regexpes`` is a list of
        ``[label, features, indices]`` entries, one per completed cluster in
        file order.

    Raises:
        ValueError: if a header or ``n_features=`` value is not an integer,
            or a feature line does not consist of exactly two tokens with an
            integer index.
        IndexError: if a header or ``n_features=`` line has too few tokens,
            or a feature index falls outside ``mysekitei.tags_order``.
    """
    mysekitei = sekitei([], alpha=0.01)
    regexpes = []
    cluster_id, remaining = 0, 0
    features, indices = [], []

    with open(clusters_filename, 'r') as handle:
        for line in handle:
            if line.startswith('---'):
                header = line[3:].split()
                cluster_id, remaining = int(header[0]), int(header[2])
                # NOTE(review): a header announcing 0 features never reaches
                # the append below, so such a cluster is left out of
                # regexpes -- confirm that this is intended.
            elif remaining:
                feature, index = line.split()
                index = int(index)
                mysekitei.tags.add(feature)
                mysekitei.tags_order[index] = feature
                features.append(feature)
                indices.append(index)
                remaining -= 1
                if not remaining:
                    # Last expected feature: the cluster is complete.
                    regexpes.append([cluster_id, features, indices])
                    features, indices = [], []
            else:
                fields = line.split()
                if fields and fields[0] == 'n_features=':
                    mysekitei.n_features = int(fields[1])
                    mysekitei.tags_order = [''] * mysekitei.n_features

    return mysekitei, regexpes