def load_Lexicon(self, Lexicon_file):
	"""Restore feature lexicons, weights and the text pipeline from a JSON file.

	Lexicon_file: path to a file written by save_Lexicon.

	NOTE(review): this module-level copy duplicates feature.load_Lexicon below
	and takes 'self', so it only works if explicitly attached to a class --
	consider deleting it in favour of the method (verify no module patches it in).
	"""
	# 'with' guarantees the handle is closed even if json.load raises, and the
	# local name no longer shadows the builtin 'input'.
	with codecs.open(Lexicon_file, 'r', 'utf-8') as lexicon_file:
		in_json = json.load(lexicon_file)
	self.feature_list = in_json['feature_list']
	self.unigram = in_json['feature_unigram']
	self.bigram = in_json['feature_bigram']
	self.trigram = in_json['feature_trigram']
	# n-gram lexicons are tuple-keyed in memory but string-keyed in JSON
	self.UNI_LEX = StrKeyDict2TupleKeyDict(in_json['UNI_LEX'])
	self.BI_LEX = StrKeyDict2TupleKeyDict(in_json['BI_LEX'])
	self.TRI_LEX = StrKeyDict2TupleKeyDict(in_json['TRI_LEX'])
	self.UNI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['UNI_LEX_weight'])
	self.BI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['BI_LEX_weight'])
	self.TRI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['TRI_LEX_weight'])

	self.TOPIC_LEX = in_json['TOPIC_LEX']
	self.BASELINE_LEX = in_json['BASELINE_LEX']

	# the value-match feature is optional in the saved file
	if 'ValueMatchFeature' in in_json and in_json['ValueMatchFeature']:
		self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
		self.ValueMatchFeature.Load(in_json['ValueMatchFeature'])
	else:
		self.ValueMatchFeature = None

	# rebuild the text-processing pipeline exactly as it was at save time
	self.tokenizer_mode = in_json['tokenizer_mode']
	self.tokenizer = tokenizer(self.tokenizer_mode)

	self.use_stemmer = in_json['use_stemmer']
	self.stemmer = stemmer(self.use_stemmer)

	self.remove_stopwords = in_json['remove_stopwords']
	self.remove_punctuation = in_json['remove_punctuation']
	self.replace_num = in_json['replace_num']
	self.ngram_builder = NGRAM_builder(self.remove_stopwords, self.remove_punctuation, self.replace_num)

	self._prepare_resources()
class feature(object):
	# Logger / config section name for this component.
	MY_ID = 'svc_feature'

	def __init__(self, tagsets, tokenizer_mode=None, use_stemmer=None, remove_stopwords=None):
		"""Build an (untrained) feature extractor.

		tagsets: ontology tagsets, forwarded to ValueMatchFeature when used.
		tokenizer_mode / use_stemmer / remove_stopwords: explicit overrides;
		when omitted (None/empty) the value is read from the config section
		named by MY_ID.
		Lexicons stay None/False until Stat_Lexicon or load_Lexicon runs.
		"""
		self.config = GetConfig()
		self.appLogger = logging.getLogger(self.MY_ID)

		# tokenizer: explicit argument wins, otherwise read from config
		if tokenizer_mode:
			self.tokenizer_mode = tokenizer_mode
		else:
			self.tokenizer_mode = self.config.get(self.MY_ID, 'tokenizer_mode')
		self.appLogger.debug('tokenizer mode: %s' %(self.tokenizer_mode))
		self.tokenizer = tokenizer(self.tokenizer_mode)

		# stemmer
		# Bug fix: the original chained assignment
		#   use_stemmer = self.tokenizer_mode = self.config.getboolean(...)
		# clobbered self.tokenizer_mode with a boolean right after it was set.
		if use_stemmer is None:
			use_stemmer = self.config.getboolean(self.MY_ID, 'use_stemmer')
		self.appLogger.debug('use stemmer ? %s' %(use_stemmer))
		self.use_stemmer = use_stemmer
		self.stemmer = stemmer(use_stemmer)

		# ngram builder
		if remove_stopwords is None:
			self.remove_stopwords = self.config.getboolean(self.MY_ID, 'remove_stopwords')
		else:
			self.remove_stopwords = remove_stopwords
		self.remove_punctuation = self.config.getboolean(self.MY_ID, 'remove_punctuation')
		self.replace_num = self.config.getboolean(self.MY_ID, 'replace_num')
		self.ngram_builder = NGRAM_builder(self.remove_stopwords, self.remove_punctuation, self.replace_num)

		self.tagsets = tagsets

		# which feature groups are active (set by Stat_Lexicon / load_Lexicon)
		self.feature_list = None
		self.unigram = False
		self.bigram = False
		self.trigram = False

		# lexicons: {feature: 1-based index within the group}
		self.UNI_LEX = None
		self.BI_LEX = None
		self.TRI_LEX = None

		# per-feature weights, parallel to the lexicons
		self.UNI_LEX_weight = None
		self.BI_LEX_weight = None
		self.TRI_LEX_weight = None

		self.TOPIC_LEX = None
		self.BASELINE_LEX = None

		self.ValueMatchFeature = None

		# starting index of each feature group in the final sparse vector
		self.TOPIC_LEX_offset = 0
		self.UNI_LEX_offset = 0
		self.BI_LEX_offset = 0
		self.TRI_LEX_offset = 0
		self.BASELINE_LEX_offset = 0
		self.VMF_offset = 0
		# True once _prepare_resources has computed the offsets
		self.is_set = False

	def _set_offset(self):
		"""Assign the starting index of every enabled feature group.

		Indices inside each lexicon start at 1 (see _stat_lexicon), so a group
		occupies [offset + 1, offset + len(lex)] and the next enabled group
		starts right after it.

		Bug fix: the original gated each group's offset on the *previous*
		group's flag (e.g. BASELINE_LEX_offset was only set when trigrams were
		enabled), so with the default 'NGRAM_u:b' configuration BASELINE
		features collided with TOPIC features at offset 0; VMF_offset also
		added the value-match feature's own size instead of the baseline
		lexicon's.  Offsets are now accumulated over the groups that are
		actually enabled.
		NOTE(review): this changes feature indices, so models trained with the
		old offsets must be retrained.
		"""
		self.TOPIC_LEX_offset = 0
		self.UNI_LEX_offset = 0
		self.BI_LEX_offset = 0
		self.TRI_LEX_offset = 0
		self.BASELINE_LEX_offset = 0
		self.VMF_offset = 0

		next_offset = 0
		if 'TOPIC' in self.feature_list:
			self.TOPIC_LEX_offset = next_offset
			next_offset += len(self.TOPIC_LEX)
		if self.unigram:
			self.UNI_LEX_offset = next_offset
			next_offset += len(self.UNI_LEX)
		if self.bigram:
			self.BI_LEX_offset = next_offset
			next_offset += len(self.BI_LEX)
		if self.trigram:
			self.TRI_LEX_offset = next_offset
			next_offset += len(self.TRI_LEX)
		if 'BASELINE' in self.feature_list:
			self.BASELINE_LEX_offset = next_offset
			next_offset += len(self.BASELINE_LEX)
		if self.ValueMatchFeature:
			self.VMF_offset = next_offset

	def _preprocessing(self, sent):
		"""Lowercase, tokenize, apply the builder's token replacement, then stem.

		sent: a raw sentence string.
		Returns the list of normalized tokens.
		"""
		lowered = sent.lower()
		raw_tokens = self.tokenizer.tokenize(lowered)
		replaced = self.ngram_builder.PreReplace(raw_tokens)
		return [self.stemmer.stem(token) for token in replaced]


	def _prepare_resources(self):
		"""Recompute the feature-group offsets and mark the extractor ready."""
		self._set_offset()
		self.is_set = True

	def load_Lexicon(self, Lexicon_file):
		"""Restore feature lexicons, weights and the text pipeline from JSON.

		Lexicon_file: path to a file written by save_Lexicon.
		Ends by calling _prepare_resources so offsets are ready for extraction.
		"""
		# 'with' guarantees the handle is closed even if json.load raises, and
		# the local name no longer shadows the builtin 'input'.
		with codecs.open(Lexicon_file, 'r', 'utf-8') as lexicon_file:
			in_json = json.load(lexicon_file)
		self.feature_list = in_json['feature_list']
		self.unigram = in_json['feature_unigram']
		self.bigram = in_json['feature_bigram']
		self.trigram = in_json['feature_trigram']
		# n-gram lexicons are tuple-keyed in memory but string-keyed in JSON
		self.UNI_LEX = StrKeyDict2TupleKeyDict(in_json['UNI_LEX'])
		self.BI_LEX = StrKeyDict2TupleKeyDict(in_json['BI_LEX'])
		self.TRI_LEX = StrKeyDict2TupleKeyDict(in_json['TRI_LEX'])
		self.UNI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['UNI_LEX_weight'])
		self.BI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['BI_LEX_weight'])
		self.TRI_LEX_weight = StrKeyDict2TupleKeyDict(in_json['TRI_LEX_weight'])

		self.TOPIC_LEX = in_json['TOPIC_LEX']
		self.BASELINE_LEX = in_json['BASELINE_LEX']

		# the value-match feature is optional in the saved file
		if 'ValueMatchFeature' in in_json and in_json['ValueMatchFeature']:
			self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
			self.ValueMatchFeature.Load(in_json['ValueMatchFeature'])
		else:
			self.ValueMatchFeature = None

		# rebuild the text-processing pipeline exactly as it was at save time
		self.tokenizer_mode = in_json['tokenizer_mode']
		self.tokenizer = tokenizer(self.tokenizer_mode)

		self.use_stemmer = in_json['use_stemmer']
		self.stemmer = stemmer(self.use_stemmer)

		self.remove_stopwords = in_json['remove_stopwords']
		self.remove_punctuation = in_json['remove_punctuation']
		self.replace_num = in_json['replace_num']
		self.ngram_builder = NGRAM_builder(self.remove_stopwords, self.remove_punctuation, self.replace_num)

		self._prepare_resources()


	def save_Lexicon(self, Lexicon_file):
		"""Serialize lexicons, weights and pipeline settings to a JSON file.

		Lexicon_file: output path; the file is readable by load_Lexicon.
		"""
		out_json = {}
		# text-processing settings, needed to rebuild the pipeline on load
		out_json['tokenizer_mode'] = self.tokenizer_mode
		out_json['use_stemmer'] = self.use_stemmer
		out_json['remove_stopwords'] = self.remove_stopwords
		out_json['remove_punctuation'] = self.remove_punctuation
		out_json['replace_num'] = self.replace_num

		out_json['feature_list'] = self.feature_list
		out_json['feature_unigram'] = self.unigram
		out_json['feature_bigram'] = self.bigram
		out_json['feature_trigram'] = self.trigram

		# tuple-keyed n-gram lexicons must be converted to string keys for JSON
		out_json['UNI_LEX'] = TupleKeyDict2StrKeyDict(self.UNI_LEX)
		out_json['BI_LEX'] = TupleKeyDict2StrKeyDict(self.BI_LEX)
		out_json['TRI_LEX'] = TupleKeyDict2StrKeyDict(self.TRI_LEX)
		out_json['UNI_LEX_weight'] = TupleKeyDict2StrKeyDict(self.UNI_LEX_weight)
		out_json['BI_LEX_weight'] = TupleKeyDict2StrKeyDict(self.BI_LEX_weight)
		out_json['TRI_LEX_weight'] = TupleKeyDict2StrKeyDict(self.TRI_LEX_weight)

		out_json['TOPIC_LEX'] = self.TOPIC_LEX
		out_json['BASELINE_LEX'] = self.BASELINE_LEX

		if self.ValueMatchFeature:
			out_json['ValueMatchFeature'] = self.ValueMatchFeature.Save()
		else:
			out_json['ValueMatchFeature'] = None
		# 'with' closes the file even if json.dump raises (the original leaked
		# the handle on error)
		with codecs.open(Lexicon_file, 'w', 'utf-8') as output_file:
			json.dump(out_json, output_file, indent=4)


	def Stat_Lexicon(self, train_samples, label_samples,  feature_list = ['TOPIC', 'NGRAM_u:b', 'BASELINE', 'VALUE_MATCH']):
		'''
		Build every feature lexicon (and n-gram weights) from training data.

		train_samples: list of samples; each sample is a list with one entry
		per item of feature_list (TOPIC/BASELINE: list of lexicon keys,
		NGRAM*: list of sentence strings).
		label_samples: labels aligned with train_samples (only their count is
		checked; weighting currently uses the 'simple' method).
		feature_list: feature names; 'NGRAM_<flags>' encodes u/b/t sizes.

		NOTE(review): an identical re-definition of Stat_Lexicon appears later
		in this class and overrides this one, so this copy is dead code --
		keep the two in sync or delete one of them.
		NOTE(review): the mutable default for feature_list is shared across
		calls; it is never mutated here, but this is fragile.
		'''
		if len(train_samples) != len(label_samples):
			self.appLogger.error('Error: size of train samples and label samples mismatch! %d : %d' %(len(train_samples), len(label_samples)))
			raise Exception('Error: size of train samples and label samples mismatch! %d : %d' %(len(train_samples), len(label_samples)))
		if len(train_samples) == 0:
			self.appLogger.error('Error: No samples!')
			raise Exception('Error: No samples!')

		self.feature_list = feature_list
		sample_field_num = len(train_samples[0])
		if sample_field_num != len(self.feature_list):
			self.appLogger.error('Error: size of sample field num and feature list mismatch! %d : %d' %(sample_field_num, len(self.feature_list)))
			raise Exception('Error: size of sample field num and feature list mismatch! %d : %d' %(sample_field_num, len(self.feature_list)))
		'''
		print feature_list
		print train_samples[0]
		print label_samples[0]
		'''

		# decode the n-gram size flags, e.g. 'NGRAM_u:b' -> unigram + bigram
		for feature in feature_list:
			if feature.startswith('NGRAM'):
				ngram_feature = feature[6:]	# strip the 'NGRAM_' prefix
				tokens = ngram_feature.split(':')
				for t in tokens:
					if t == 'u':
						self.unigram = True
						continue
					elif t == 'b':
						self.bigram = True
						continue
					elif t == 't':
						self.trigram = True
						continue
					else:
						self.appLogger.error('Unknown ngram feature! %s' %(ngram_feature))
						raise Exception('Unknown ngram feature! %s' %(ngram_feature))

		# build one lexicon per feature column
		for i, feature in enumerate(self.feature_list):
			if feature == 'TOPIC':
				#print i
				topic_samples = [train_sample[i] for train_sample in train_samples]
				#print topic_samples[0:3]
				self.TOPIC_LEX = self._stat_lexicon(topic_samples, threshold = 0)
			elif feature == 'BASELINE':
				#print i
				baseline_samples = [train_sample[i] for train_sample in train_samples]
				#print baseline_samples[0:3]
				self.BASELINE_LEX = self._stat_lexicon(baseline_samples, threshold = 0)
			elif feature.startswith('NGRAM'):
				#print i
				sent_samples = [train_sample[i] for train_sample in train_samples]
				#print sent_samples[0:3]
				unigram_lists = []
				bigram_lists = []
				trigram_lists = []
				for sents in sent_samples:
					for sent in sents:
						#print sent
						tokens = self._preprocessing(sent)
						if self.unigram:
							unigram_lists.append(self.ngram_builder.GenerateNGRAM(tokens,1))
						if self.bigram:
							bigram_lists.append(self.ngram_builder.GenerateNGRAM(tokens,2))
						if self.trigram:
							trigram_lists.append(self.ngram_builder.GenerateNGRAM(tokens,3))

				if self.unigram:
					self.UNI_LEX = self._stat_lexicon(unigram_lists, threshold=2)
					self.UNI_LEX_weight = self._calc_feature_weight(unigram_lists, label_samples, self.UNI_LEX, 'simple')
				if self.bigram:
					self.BI_LEX = self._stat_lexicon(bigram_lists, threshold=2)
					self.BI_LEX_weight = self._calc_feature_weight(bigram_lists, label_samples, self.BI_LEX,'simple')
				if self.trigram:
					self.TRI_LEX = self._stat_lexicon(trigram_lists, threshold=2)
					self.TRI_LEX_weight = self._calc_feature_weight(trigram_lists, label_samples, self.TRI_LEX,'simple')
			elif feature == 'VALUE_MATCH':
				self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
			else:
				self.appLogger.error('Unknown feature! %s' %(feature))
				raise Exception('Unknown feature! %s' %(feature))
		return

	def _calc_feature_weight(self, feature_lists, label_samples, lexcion, method = 'simple'):
		"""Compute a weight for every lexicon entry.

		feature_lists: per-sample feature lists, one list per training sample.
		label_samples: unused by both methods; kept for interface compatibility.
		lexcion: the {feature: index} lexicon (parameter name preserved as-is).
		method: 'simple' gives every entry weight 1; 'IDF' gives
		log(N / document-frequency) with N = number of samples.
		Raises Exception on an unknown method name.
		"""
		if method == 'simple':
			lexicon_weight = dict.fromkeys(lexcion, 1)
		elif method == 'IDF':
			# float base value keeps the later division a true division
			lexicon_weight = dict.fromkeys(lexcion, 0.0)
			total = len(feature_lists)
			for sample_features in feature_lists:
				# each sample contributes at most once per feature
				for feat in set(sample_features):
					if feat in lexcion:
						lexicon_weight[feat] += 1
			for feat in lexicon_weight:
				lexicon_weight[feat] = math.log(total / lexicon_weight[feat])
		else:
			self.appLogger.error('Unknown weight calculate method! %s' %(method))
			raise Exception('Unknown weight calculate method! %s' %(method))

		return lexicon_weight

	def _stat_lexicon(self, feature_lists, threshold):
		"""Count feature occurrences and build a {feature: 1-based index} lexicon.

		feature_lists: per-sample feature lists (occurrences are counted over
		all of them, duplicates within a sample included).
		threshold: keep a feature only if its total count is > threshold.

		Bug fix: the first occurrence previously initialized the count to 0,
		so every frequency was one too low and features occurring exactly
		threshold+1 times were silently dropped (with threshold=0, singleton
		topics vanished).  Counts now start at 1.
		NOTE(review): lexicons built with the old counting differ, so saved
		models should be retrained after this change.
		"""
		lexicon_count = {}
		for feature in feature_lists:
			for f in feature:
				lexicon_count[f] = lexicon_count.get(f, 0) + 1

		lexicon_out = {}
		for f, count in lexicon_count.items():
			if count > threshold:
				# indices start at 1; offsets in _set_offset rely on this
				lexicon_out[f] = len(lexicon_out) + 1
		return lexicon_out

	def ExtractFeatureFromTuple(self, feature_tuple):
		"""Turn one sample tuple into a sparse {index: value} feature vector.

		feature_tuple holds one entry per item of self.feature_list:
		  TOPIC / BASELINE : list of lexicon keys
		  NGRAM*           : list of sentence strings
		  VALUE_MATCH      : (topic, list of sentences)
		Returns a dict mapping global feature index to its (weighted) count.
		Raises Exception when the tuple width does not match self.feature_list.
		"""
		if len(feature_tuple) != len(self.feature_list):
			# Bug fix: the original built this message without the '%' operator
			# (TypeError) and referenced the bare name 'feature_list'
			# (NameError) instead of self.feature_list.
			msg = 'size of feature_tuple and the feature_list mismatch! %d : %d' % (len(feature_tuple), len(self.feature_list))
			self.appLogger.error(msg)
			raise Exception(msg)
		feature_vector = {}
		for i, feature in enumerate(self.feature_list):
			if feature == 'TOPIC':
				for f in feature_tuple[i]:
					if f in self.TOPIC_LEX:
						idx = self.TOPIC_LEX_offset + self.TOPIC_LEX[f]
						feature_vector[idx] = feature_vector.get(idx, 0) + 1
			elif feature == 'BASELINE':
				for f in feature_tuple[i]:
					if f in self.BASELINE_LEX:
						idx = self.BASELINE_LEX_offset + self.BASELINE_LEX[f]
						feature_vector[idx] = feature_vector.get(idx, 0) + 1
			elif feature.startswith('NGRAM'):
				# (n, enabled-flag, lexicon, weights, offset) per n-gram size
				ngram_specs = (
					(1, self.unigram, self.UNI_LEX, self.UNI_LEX_weight, self.UNI_LEX_offset),
					(2, self.bigram, self.BI_LEX, self.BI_LEX_weight, self.BI_LEX_offset),
					(3, self.trigram, self.TRI_LEX, self.TRI_LEX_weight, self.TRI_LEX_offset),
				)
				for sent in feature_tuple[i]:
					tokens = self._preprocessing(sent)
					# Bug fix: the trigram branch rebuilt 'tk' from an
					# undefined index 'j' and tested an undefined name 'key'
					# (NameError whenever trigrams were enabled); all three
					# sizes now share one correct path.
					for n, enabled, lex, weights, offset in ngram_specs:
						if not enabled:
							continue
						for tk in self.ngram_builder.GenerateNGRAM(tokens, n):
							if tk in lex:
								idx = offset + lex[tk]
								feature_vector[idx] = feature_vector.get(idx, 0) + weights[tk]
			elif feature == 'VALUE_MATCH':
				temp_feature_vec = {}
				topic = feature_tuple[i][0]
				sents = feature_tuple[i][1]
				for sent in sents:
					f = self.ValueMatchFeature.extract_trans_feature(sent, topic)
					temp_feature_vec = self.ValueMatchFeature.Merge2Features(temp_feature_vec, f)
				for idx, value in temp_feature_vec.items():
					feature_vector[idx + self.VMF_offset] = value
			# unknown feature names are silently skipped, matching the original

		return feature_vector
	def Stat_Lexicon(self, train_samples, label_samples,  feature_list = ['TOPIC', 'NGRAM_u:b', 'BASELINE', 'VALUE_MATCH']):
		"""Build every feature lexicon (and n-gram weights) from training data.

		train_samples: list of samples; each sample is a list with one entry
		per item of feature_list (TOPIC/BASELINE: list of lexicon keys,
		NGRAM*: list of sentence strings).
		label_samples: labels aligned with train_samples (only their count is
		checked; weighting currently uses the 'simple' method).
		feature_list: feature names; 'NGRAM_<flags>' encodes u/b/t sizes.
		Raises Exception on any size mismatch or unknown feature name.
		"""
		if len(train_samples) != len(label_samples):
			msg = 'Error: size of train samples and label samples mismatch! %d : %d' %(len(train_samples), len(label_samples))
			self.appLogger.error(msg)
			raise Exception(msg)
		if not train_samples:
			self.appLogger.error('Error: No samples!')
			raise Exception('Error: No samples!')

		self.feature_list = feature_list
		field_count = len(train_samples[0])
		if field_count != len(self.feature_list):
			msg = 'Error: size of sample field num and feature list mismatch! %d : %d' %(field_count, len(self.feature_list))
			self.appLogger.error(msg)
			raise Exception(msg)

		# decode the n-gram size flags, e.g. 'NGRAM_u:b' -> unigram + bigram
		flag_names = {'u': 'unigram', 'b': 'bigram', 't': 'trigram'}
		for feat in feature_list:
			if not feat.startswith('NGRAM'):
				continue
			spec = feat[6:]	# strip the 'NGRAM_' prefix
			for mark in spec.split(':'):
				if mark in flag_names:
					setattr(self, flag_names[mark], True)
				else:
					self.appLogger.error('Unknown ngram feature! %s' %(spec))
					raise Exception('Unknown ngram feature! %s' %(spec))

		# build one lexicon per feature column
		for col, feat in enumerate(self.feature_list):
			if feat == 'TOPIC':
				topic_column = [sample[col] for sample in train_samples]
				self.TOPIC_LEX = self._stat_lexicon(topic_column, threshold = 0)
			elif feat == 'BASELINE':
				baseline_column = [sample[col] for sample in train_samples]
				self.BASELINE_LEX = self._stat_lexicon(baseline_column, threshold = 0)
			elif feat.startswith('NGRAM'):
				sentence_column = [sample[col] for sample in train_samples]
				uni_lists = []
				bi_lists = []
				tri_lists = []
				for sents in sentence_column:
					for sent in sents:
						tokens = self._preprocessing(sent)
						if self.unigram:
							uni_lists.append(self.ngram_builder.GenerateNGRAM(tokens,1))
						if self.bigram:
							bi_lists.append(self.ngram_builder.GenerateNGRAM(tokens,2))
						if self.trigram:
							tri_lists.append(self.ngram_builder.GenerateNGRAM(tokens,3))

				if self.unigram:
					self.UNI_LEX = self._stat_lexicon(uni_lists, threshold=2)
					self.UNI_LEX_weight = self._calc_feature_weight(uni_lists, label_samples, self.UNI_LEX, 'simple')
				if self.bigram:
					self.BI_LEX = self._stat_lexicon(bi_lists, threshold=2)
					self.BI_LEX_weight = self._calc_feature_weight(bi_lists, label_samples, self.BI_LEX, 'simple')
				if self.trigram:
					self.TRI_LEX = self._stat_lexicon(tri_lists, threshold=2)
					self.TRI_LEX_weight = self._calc_feature_weight(tri_lists, label_samples, self.TRI_LEX, 'simple')
			elif feat == 'VALUE_MATCH':
				self.ValueMatchFeature = ValueMatchFeature(self.tagsets)
			else:
				self.appLogger.error('Unknown feature! %s' %(feat))
				raise Exception('Unknown feature! %s' %(feat))
		return