예제 #1
0
def train_extractor(extractor,filenames,window_size):
	"""Train ``extractor`` on windowed input, finalise it and save it.

	The training data is whatever ``windowed(filenames, window_size)``
	produces. The return value is the result of ``extractor.save()``.
	"""
	training_windows = windowed(filenames,window_size)
	extractor.train(training_windows)
	extractor.finalise()
	saved = extractor.save()
	return saved
예제 #2
0
import matplotlib.pyplot as plt
from collections import defaultdict
def plot_hist(bin_size,bin_list, upper=None):
	"""Show one bar chart per entry of ``bin_list``.

	For every container in ``bin_list`` the values at the integer indices
	``0..up_bound`` (inclusive) are drawn as unit-width bars.

	Args:
		bin_size: Not used by this function; kept so existing callers
			keep working.
		bin_list: Iterable of bin containers. Each one must support
			``max()`` and integer indexing for every index up to the
			bound. (Presumably dict/defaultdict objects mapping bin
			index -> count -- confirm against the caller.)
		upper: Highest index to plot, inclusive. When ``None`` the bound
			is ``max(bins)``, computed separately for each container.
	"""
	for bins in bin_list:
		# Test against None explicitly: ``upper or max(bins)`` would
		# silently discard an explicit ``upper=0``.
		up_bound = max(bins) if upper is None else upper
		x = list(range(up_bound+1))
		y = [bins[i] for i in x]
		# Create the figure only after the data has been computed, so a
		# failure above does not leave an empty figure behind.
		fig = plt.figure()
		ax = fig.add_subplot(1,1,1)
		ax.bar(x,y,width=1)
		# show() is called inside the loop: one display per container.
		plt.show()


# Command line: argv[1] is the window size, argv[2:] is the list handed to
# windowed() as its first argument.
input_files = sys.argv[2:]
window_size = int(sys.argv[1])

# One document per window: the third field of the first element of each
# item yielded by windowed() is joined into a space-separated string.
docs = []
for w,_ in windowed(input_files,window_size):
	docs.append(' '.join(w[2]))
tokenised_docs = [filter_tokenise(doc) for doc in docs]

num_topics = 3
# NOTE: this rebinds the name ``lda`` to the sampler instance, replacing
# whatever ``lda`` referred to before (presumably the lda module).
lda = lda.LDASampler(
	docs=tokenised_docs,
	num_topics=num_topics,
	alpha=0.25,
	beta=0.25)

print('Sampling...')
num_iterations = 100
for _ in range(num_iterations):
	# Read the current assignments before advancing the sampler; the
	# value is not used further within this block.
	zs = lda.assignments
	lda.next()
# Emit a single blank line once sampling is done.
print('')