Пример #1
0
	def add(self, docid) :
		"""Add document ``docid`` to the cluster and re-select the clustroid.

		``self.maxdist`` is kept parallel to ``self.vectorsid``: each entry
		holds the largest ``cosdist`` value seen so far between that member
		and the other members. The new document's distance to every existing
		member is computed, existing entries are raised where the new
		document is farther away, and an entry for the new document is
		appended. The member with the smallest entry then becomes the
		clustroid; its vector is re-read only when the clustroid changes.
		"""
		newvec = self.reader.read_docId(docid)
		farthest = 0
		for pos, memberid in enumerate(self.vectorsid) :
			d = cosdist(newvec, self.reader.read_docId(memberid))
			# max() keeps its first argument on ties, so entries only
			# change when the new distance is strictly larger.
			self.maxdist[pos] = max(self.maxdist[pos], d)
			farthest = max(farthest, d)

		self.vectorsid.append(docid)
		self.maxdist.append(farthest)

		# Clustroid = member whose worst-case distance to the others is smallest.
		bestid = self.vectorsid[np.argmin(self.maxdist)]
		if bestid == self.clustroidid :
			return
		self.clustroidid = bestid
		self.clustroid = self.reader.read_docId(bestid)
Пример #2
0
	def merge(self, cluster) :
		"""Absorb all members of ``cluster`` into this cluster.

		Every cross pair (member of ``self``, member of ``cluster``) is
		compared with ``cosdist``; the per-member maximum distances of both
		sides are raised accordingly. The members and their updated maximum
		distances are then appended to this cluster, and the member with the
		smallest maximum distance becomes the clustroid (its vector is
		re-read only when the clustroid actually changes).

		``cluster`` itself is left unmodified: its ``maxdist`` list is copied
		before being updated, so the argument stays internally consistent
		even if the caller keeps using it after the merge.

		Vectors of ``cluster`` are read through ``cluster.reader``; the new
		clustroid vector is read through ``self.reader``.
		"""
		selfvectors = [self.reader.read_docId(i) for i in self.vectorsid]
		clustervectors = [cluster.reader.read_docId(i) for i in cluster.vectorsid]
		# Copy instead of aliasing: updating cluster.maxdist in place would
		# silently corrupt the other cluster's bookkeeping.
		clustermaxdist = list(cluster.maxdist)
		for i, selfvec in enumerate(selfvectors) :
			for j, clustervec in enumerate(clustervectors) :
				dist = cosdist(selfvec, clustervec)
				if dist > self.maxdist[i] :
					self.maxdist[i] = dist
				if dist > clustermaxdist[j] :
					clustermaxdist[j] = dist
		self.vectorsid.extend(cluster.vectorsid)
		self.maxdist.extend(clustermaxdist)
		# Clustroid = member whose worst-case distance to the others is smallest.
		newclustroidid = self.vectorsid[np.argmin(self.maxdist)]
		if newclustroidid != self.clustroidid :
			self.clustroidid = newclustroidid
			self.clustroid = self.reader.read_docId(newclustroidid)
Пример #3
0
#clusters = [Cluster(tfidf_list[c_idx]) for c_idx in centroids_idx]

# Initialize centroids by hierarchical clustering on a random sample:
# draw 5 points per requested cluster and start with one singleton
# Cluster per sampled document.
#nsample = math.ceil(0.05*npoint)
nsample = 5*ncluster
idxsample = rnd.sample(range(npoint), nsample)
# read_idx(...)[0] is used as the document id handed to Cluster.
docIdsample = [reader.read_idx(idx)[0] for idx in idxsample]
clusters_s = [Cluster(docid, reader) for docid in docIdsample]

# Symmetric matrix of pairwise clustroid distances. The diagonal is NaN
# so that a NaN-ignoring minimum search never pairs a cluster with itself.
distmat = np.zeros((nsample, nsample))
np.fill_diagonal(distmat, np.nan)

print('Calculating initial clustroids')

# Only the upper triangle is computed; each value is mirrored below the diagonal.
for i, left in enumerate(clusters_s) :
	for j in range(i+1, nsample) :
		dist = cosdist(left.clustroid, clusters_s[j].clustroid)
		distmat[i,j] = dist
		distmat[j,i] = dist

nmerged = nsample
while nmerged != ncluster :
	flatidx = np.nanargmin(distmat)
	i = flatidx//nsample
	j = flatidx%nsample
	clusters_s[i].merge(clusters_s[j])
	clusters_s[j] = None
	distmat[j,:] = None
	distmat[:,j] = None
	for k in range(nsample) :
		if clusters_s[k] != None and k != i :
			dist = cosdist(clusters_s[k].clustroid, clusters_s[i].clustroid)