/
cluster.py
85 lines (72 loc) · 3.03 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations
import itertools
import silhoutte
def review_to_wordlist(review, lang, remove_stopwords=True):
review_text = re.sub("[^a-zA-Z]", " ", review)
words = review_text.lower().split()
if remove_stopwords:
if lang == "en":
with open("./stopwords/english.txt") as f:
stoplist = f.readlines()
stoplist = [x.strip('\n') for x in stoplist]
elif lang == "nl":
with open("./stopwords/dutch.txt") as f:
stoplist = f.readlines()
stoplist = [x.strip('\n') for x in stoplist]
else:
with open("./stopwords/greek.txt") as f:
stoplist = f.readlines()
stoplist = [x.strip('\n') for x in stoplist]
words = [w for w in words if not w in stoplist]
return words
def clustering(list_perproblem, lang):
vectorizer = TfidfVectorizer(analyzer="char", tokenizer=None, preprocessor=None, stop_words=None,
max_features=5000, min_df=2, ngram_range=(3, 8))
clean_text = []
for j in xrange(0, len(list_perproblem)):
clean_text.append(list_perproblem[j])
data_features = vectorizer.fit_transform(clean_text)
data_features = data_features.toarray()
n_start = int(len(data_features)/2)
n_stop = len(data_features)-5
step = 5
n_cls_range = range(n_start, n_stop, step)
n_cls = silhoutte.number_clusters(data_features, n_cls_range)
spectral = KMeans(n_clusters=n_cls).fit(data_features)
label = spectral.fit_predict(data_features)
return data_features, label
def clustering_word2vec(data_features):
n_start = int(len(data_features)/2)
n_stop = len(data_features)-5
step = 5
n_cls_range = range(n_start, n_stop, step)
n_cls = silhoutte.number_clusters(data_features, n_cls_range)
spectral = KMeans(n_clusters=n_cls).fit(data_features)
label = spectral.fit_predict(data_features)
return label
def similarity_score(list_all, dict_features):
list_all_comb = []
for i in xrange(len(list_all)):
list_comb_percluster = []
if len(list_all[i]) > 1:
for j in xrange(len(list_all[i])):
list_comb_percluster.append(list_all[i][j]["document"])
list_all_comb.append(list_comb_percluster)
combs = []
for i in xrange(len(list_all_comb)):
comb = list(combinations(list_all_comb[i], 2))
combs.append(comb)
comb_list = list(itertools.chain(*combs))
all_sim = []
for i in xrange(len(comb_list)):
doc1 = comb_list[i][0].split(".")
doc2 = comb_list[i][1].split(".")
vec1 = dict_features[doc1[0]]
vec2 = dict_features[doc2[0]]
sim = cosine_similarity(vec1, vec2)
all_sim.append(sim)
return comb_list, all_sim