import gzip
import json
import pickle
from itertools import izip
from math import sqrt
from sys import stdout

import numpy as np
from nltk.tokenize import RegexpTokenizer
from scipy.sparse import lil_matrix

from ap_weights import APWeights  # project-local model class; import path assumed

write = stdout.write  # progress-output shorthand used throughout (assumed helper)


class Generator:
	def __init__(self, options, items=None):
		data_path = "data/ted_transcript_5k.p"
		data = pickle.load(open(data_path, 'rb'))
		self.dictionary = pickle.load(open(data_path.replace(".p", ".dict"), 'rb'))
		self.tfidf = pickle.load(open(data_path.replace(".p", ".tfidf"), 'rb'))
		self.X = data["X"]
		self.Y = data["Y"]
		self.items = items  
		self.words = data["words"]
		self.rating_classes = data["classes"]
		self.debug = options['--debug']
		self.display = options['--display']
		self.input_path = options['--input']
		self.output_path = options['--output']
		self.lim = 140 # maximum number of words per chunk

	def run(self):

		emotion_scores = []
		relevances = []

		# Load the test items and represent each text chunk in the feature space.
		if self.debug:
			write("[+] Loading items:".ljust(54, '.'))
		if self.items is None:
			with open(self.input_path) as fin:
				self.items = json.load(fin)
		test_data, starts, ends = self.get_text()
		X_test = []
		for i, sentences in enumerate(test_data):
			relevances.append([])
			emotion_scores.append([0.] * 12)  # one slot per emotion class
			# One sparse row per text chunk; lil_matrix allows cheap item assignment.
			cur_x = lil_matrix((len(sentences), len(self.dictionary)))
			for j, sentence in enumerate(sentences):
				relevances[i].append([0.] * 12)
				sword_vector = self.vectorize_document(sentence)
				sfeature_vector = self.feature_extraction(sword_vector)
				for key, val in sfeature_vector:
					cur_x[j, key] = val
			X_test.append(cur_x)
		if self.debug:
			write("[OK]\n")

		# Train (or load) one model per emotion class and predict on the test data.
		if self.debug:
			write("[+] Modeling emotions:")
		for num_class in range(12):
			self.num_class = num_class + 1
			cur_X = self.X
			cur_Y = self.Y[:, (self.num_class - 1):self.num_class]  # one emotion column
			method = "ap_weights"
			rating_class = self.rating_classes[self.num_class - 1]
			# Read the three tuned regularization weights (l1, l2, l3) for this class.
			opt = []
			for b in ['_m1', '', '_m3']:
				with open('parameters/%d/%s%s.txt' % (self.num_class, method, b)) as tmp:
					opt.append(float(tmp.readline().split(": ")[1].split("}")[0]))
			try:
				# Reuse a cached model for this emotion class if one exists.
				if self.debug:
					write(("\n" + " " * 8 + "-> %s" % rating_class).ljust(55, '.'))
				f = gzip.open("models/%s_model.pk" % rating_class, 'rb')
				self.model = pickle.load(f)
				f.close()
				if self.debug:
					write("[OK]")
			except (IOError, EOFError):
				# No cached model: train one and cache it for later runs.
				if self.debug:
					write("\n")
				self.model = APWeights(20, l1=opt[0], l2=opt[1], l3=opt[2], reg=self)
				self.model.fit(cur_X, cur_Y)
				f = gzip.open("models/%s_model.pk" % rating_class, "wb")
				pickle.dump(self.model, f)
				f.close()  # close to flush the gzip stream

			# Talk-level scores for this emotion class.
			pred = self.model.predict(X_test)
			for j, val in enumerate(pred):
				emotion_scores[j][num_class] = val.view(np.ndarray)[0]

			# The fitted model's per-chunk weights double as relevance scores.
			for j, weights in enumerate(self.model.P_test):
				for i, w in enumerate(weights):
					relevances[j][i][num_class] = w

		# Compute emotion-based recommendations from the pairwise cosine
		# similarity of the talks' emotion-score vectors.
		if self.debug:
			write("\n[+] Generating recommendations".ljust(55, '.'))
		sim = np.zeros((len(X_test), len(X_test)))
		for i, v1 in enumerate(emotion_scores):
			for j, v2 in enumerate(emotion_scores):
				sim[i][j] = self.cosine_measure(v1, v2)
		if self.debug:
			write("[OK]")

		real_idxs = range(len(X_test))

		# Write results into the output file.
		if self.debug:
			write("\n[+] Saving to output file".ljust(55, '.'))
		output = self.items
		for j, h in enumerate(self.items):
			segments = test_data[j]
			h['segments'] = []
			for i, seg in enumerate(segments):
				seg_h = {'text': seg,
						 'start': starts[j][i],
						 'end': ends[j][i],
						 'relevance_scores': relevances[j][i]}
				h['segments'].append(seg_h)
			top, top_sim = self.n_most_similar(j, sim, real_idxs)
			h['emotion_classes'] = self.rating_classes
			h['emotion_scores'] = emotion_scores[j]
			h['emotion_rec'] = [self.items[idx]['id'] for idx in top]
			h['emotion_rec_scores'] = top_sim

		with open(self.output_path, "w") as fout:
			json.dump(output, fout)
		if self.debug:
			write("[OK]\n")
		print "[x] Finished."
	
	def get_text(self):
		# Split each talk's speech into chunks of roughly self.lim words,
		# keeping the start/end timestamps of every chunk.
		all_chunks = []
		all_starts = []
		all_ends = []
		for i, talk in enumerate(self.items):
			all_chunks.append([])
			all_starts.append([])
			all_ends.append([])
			chunk = []
			start = None
			end = None
			count = 0
			for seg in talk['segments']:
				if seg['classId'] == "speech":
					for word in seg['spokenWords']:
						if start is None:
							start = word['wordStart']
						else:
							end = word['wordEnd']
						chunk.append(word['wordId'])
						count += 1
				if count > self.lim:
					all_chunks[i].append(' '.join(chunk))
					all_starts[i].append(start)
					all_ends[i].append(end)
					chunk = []
					start = None
					end = None
					count = 0
			# Note: a trailing chunk shorter than the limit is discarded.
		return all_chunks, all_starts, all_ends
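	# Expected input item shape for get_text (key names taken from the
	# accesses above; values illustrative):
	# {"id": ..., "segments": [{"classId": "speech", "spokenWords": [
	#     {"wordId": "hello", "wordStart": 0.0, "wordEnd": 0.4}, ...]}, ...]}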
	
	def vectorize_document(self, document):
		# Lowercase the text and keep alphabetic tokens only.
		tokenizer = RegexpTokenizer(r'\b[A-Za-z]+\b')
		words = list(tokenizer.tokenize(document.lower()))
		return words

	def feature_extraction(self, vector_document):
		# Map tokens to (token-id, count) pairs, then re-weight them by tf-idf.
		sparse_vector = self.dictionary.doc2bow(vector_document)
		sparse_vector = self.tfidf[sparse_vector]
		return sparse_vector
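	# For illustration (token ids and weights made up):
	#   feature_extraction(['great', 'talk', 'great'])
	#   -> [(17, 0.89), (342, 0.45)]  # (token-id, tf-idf weight) pairs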

	def cosine_measure(self, v1, v2):
		# Cosine similarity: dot(v1, v2) / (||v1|| * ||v2||).
		dot = sum(x * y for x, y in izip(v1, v2))
		norm1 = sqrt(sum(x ** 2 for x in v1))
		norm2 = sqrt(sum(y ** 2 for y in v2))
		return dot / (norm1 * norm2)
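	# Worked example: cosine_measure([1., 1., 0.], [1., 0., 0.])
	# = 1 / (sqrt(2) * 1) ~= 0.707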

	def n_most_similar(self, cur_idx, sim, real_idxs, N=10):
		# Rank items by similarity (descending). Take N + 1 candidates so that
		# dropping the query item itself still leaves up to N neighbours.
		similar = []
		confidence = []
		for idx in np.argsort(sim[cur_idx])[::-1][:N + 1]:
			if idx != cur_idx:
				similar.append(real_idxs[idx])
				confidence.append(sim[cur_idx][idx])
		return similar[:N], confidence[:N]
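
# A minimal usage sketch (assumes docopt-style CLI options; the keys mirror
# those read in __init__, and the paths are illustrative):
if __name__ == "__main__":
	opts = {'--debug': True,
			'--display': False,
			'--input': 'input/talks.json',
			'--output': 'output/talks_emotions.json'}
	Generator(opts).run()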