Example #1
	def _sklearn2weka(self, features, labels=None):

		# Ordinal-encode the labels and cache the nominal -> label mapping
		if labels is not None:
			encoder = CategoricalEncoder(encoding='ordinal')
			labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))

			if not hasattr(self, '_dict'):

				label_dict = {}

				for label, nominal in zip(labels, labels_nominal):
					if nominal.item(0) not in label_dict:
						label_dict[nominal.item(0)] = label

				self._dict = label_dict

			labels_column = np.reshape(labels_nominal, [labels_nominal.shape[0], 1])

		weka_dataset = ndarray_to_instances(np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
		weka_dataset.insert_attribute(Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]), features.shape[1])

		if labels is not None:
			for index, inst in enumerate(weka_dataset):
				inst.set_value(features.shape[1], labels_column[index])
				weka_dataset.set_instance(index, inst)

		return weka_dataset
Example #2
def encode_cat(dat):
    cat_encoder = CategoricalEncoder(encoding='onehot-dense')
    dat = dat.astype('str')
    dat_reshaped = dat.values.reshape(-1, 1)
    dat_1hot = cat_encoder.fit_transform(dat_reshaped)
    col_names = [
        dat.name + '_' + str(x) for x in list(cat_encoder.categories_[0])
    ]
    return pd.DataFrame(dat_1hot, columns=col_names)
Example #3
def encode_cat(dat):
    """ functon to return a labeled data frame with one hot encoding """
    cat_encoder = CategoricalEncoder(encoding="onehot-dense")
    dat = dat.astype('str')
    dat_reshaped = dat.values.reshape(-1, 1)
    dat_1hot = cat_encoder.fit_transform(dat_reshaped)
    col_names = [
        dat.name + "_" + str(x) for x in list(cat_encoder.categories_[0])
    ]
    return pd.DataFrame(dat_1hot, columns=col_names)
Example #4
def convert_categorical_features(df):
    enc = CategoricalEncoder(encoding='ordinal')
    encoded_features = enc.fit_transform(df[[
        'dim_is_requested', 'dim_market', 'dim_room_type', 'cancel_policy',
        'dim_is_instant_bookable'
    ]])

    encoded_df = pd.DataFrame(encoded_features,
                              index=df.index,
                              columns=[
                                  'dim_is_requested', 'dim_market',
                                  'dim_room_type', 'cancel_policy',
                                  'dim_is_instant_bookable'
                              ])

    col = df.columns.tolist()
    col_non_cat = col[1:3] + col[5:6] + col[7:10] + col[11:]
    df_non_cat = df[col_non_cat]
    col_cat = encoded_df.columns.tolist()
    col_full = col_cat[:] + col_non_cat[:]

    stack_full = np.column_stack([encoded_df, df_non_cat])
    stack_df = pd.DataFrame(stack_full, index=df.index, columns=col_full)
    return stack_df
Example #5
File: exercise3.py  Project: starnex/ML
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

# In[38]:

housing_cat_1hot.toarray()

# In[39]:

from sklearn.preprocessing import CategoricalEncoder
cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[40]:

cat_encoder = CategoricalEncoder(encoding="onehot-dense")
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[41]:

cat_encoder.categories_

# In[42]:

from sklearn.base import BaseEstimator, TransformerMixin
Example #6
# In[4]:

enc = CategoricalEncoder(encoding='onehot-dense')

X_2 = np.array(X[:, 0].reshape(-1, 1))
Xq_2 = np.array(X_q[:, 0].reshape(-1, 1))
attributes = [dataset['attributes'][0][0]]

for i, (name, relation) in enumerate(dataset['attributes'][1:-1]):
    if relation == 'NUMERIC':
        X_2 = np.hstack((X_2, X[:, i + 1].reshape(-1, 1)))
        Xq_2 = np.hstack((Xq_2, X_q[:, i + 1].reshape(-1, 1)))
        attributes.append(name)
        continue

    X_2 = np.hstack((X_2, enc.fit_transform(X[:, i + 1].reshape(-1, 1))))
    Xq_2 = np.hstack((Xq_2, enc.transform(X_q[:, i + 1].reshape(-1, 1))))

    for category in enc.categories_[0]:
        attributes.append(category)

X = X_2.astype(float)
X_q = Xq_2.astype(float)

print('Num features: %d' % len(attributes))
print(attributes)

# We now have 51 features; for example, the feature entrepreneur can take the value 0 or 1, where 0 means the person is not an entrepreneur and 1 means they are.

# ### Most informative features
# Before we use PCA to remove some features, we will look at which features a Logistic Regression classifier considers most informative (a sketch of this step follows below).
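
# A minimal sketch (not part of the original notebook) of that next step:
# ranking features by the magnitude of Logistic Regression coefficients.
# The helper is hypothetical; it assumes a label vector y aligned with the
# rows of X and reuses the `attributes` list built above.
import numpy as np
from sklearn.linear_model import LogisticRegression

def most_informative_features(X, y, attributes, top_n=10):
    # fit a plain Logistic Regression and aggregate absolute coefficient
    # magnitudes across classes (coef_ has one row per class)
    clf = LogisticRegression().fit(X, y)
    importance = np.abs(clf.coef_).sum(axis=0)
    ranking = np.argsort(importance)[::-1][:top_n]
    return [(attributes[i], importance[i]) for i in ranking]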
Example #7
	def kfold_validation(self, k=10):

		sem.acquire()

		available_ram = psutil.virtual_memory()[1]
		available_ram = int(int(available_ram) * .9 * 1e-9)

		if available_ram > 5:
			jvm.start(max_heap_size='5g')
		else:
			jvm.start(max_heap_size=str(available_ram)+'g')

		###

		print('\nLoading '+self.input_file+' with opts -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
		# load .arff file
		dataset = arff.load(open(self.input_file, 'r'))
		data = np.array(dataset['data'])

		self.features_names = [x[0] for x in dataset['attributes']]

		self.attributes_number = data.shape[1]
		self.dataset_features_number = self.attributes_number - self.levels_number

		# Factorization of Nominal features_index
		encoder = CategoricalEncoder(encoding='ordinal')
		nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number])) if dataset['attributes'][i][1] != u'NUMERIC']
		if len(nominal_features_index) > 0:
			data[:, nominal_features_index] = encoder.fit_transform(
				data[:, nominal_features_index])

		# Impute missing value by fitting over training set and transforming both sets
		imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
		data[:, :self.dataset_features_number] = imp.fit_transform(data[:, :self.dataset_features_number])

		classifiers_per_fold = []
		oracles_per_fold = []
		predictions_per_fold = []
		predictions_per_fold_all = []

		print('\n***\nStart testing with '+str(k)+'Fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')

		bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar(
			'=', '[', ']'), ' ', progressbar.Percentage()])
		bar.start()

		skf = StratifiedKFold(n_splits=k, shuffle=True)
		bar_cnt = 0

		for train_index, test_index in skf.split(data, data[:,self.attributes_number-1]):

			self.classifiers = []

			self.training_set = data[train_index, :self.dataset_features_number]
			self.testing_set = data[test_index, :self.dataset_features_number]
			self.ground_through = data[train_index, self.dataset_features_number:]
			self.oracle = data[test_index, self.dataset_features_number:]
			self.prediction = np.ndarray(shape=[len(test_index),self.levels_number],dtype='<U24')
			self.prediction_all = np.ndarray(shape=[len(test_index),self.levels_number],dtype='<U24')

			root = Tree()

			root.train_index = [i for i in range(self.training_set.shape[0])]
			root.test_index = [i for i in range(self.testing_set.shape[0])]
			root.test_index_all = root.test_index
			root.children_tags = list(set(self.ground_through[root.train_index, root.level]))
			root.children_number = len(root.children_tags)

			if self.has_config:
				if 'f' in config[root.tag + '_' + str(root.level + 1)]:
					root.features_number = config[root.tag + '_' + str(root.level + 1)]['f']
				elif 'p' in config[root.tag + '_' + str(root.level + 1)]:
					root.packets_number = config[root.tag + '_' + str(root.level + 1)]['p']
				root.classifier_name = config[root.tag + '_' + str(root.level + 1)]['c']

				print('config','tag',root.tag,'level',root.level,'f',root.features_number,'c',root.classifier_name)
			else:
				root.features_number = self.features_number
				root.packets_number = self.packets_number
				root.classifier_name = self.classifier_name

			self.classifiers.append(root)

			if root.children_number > 1:

				classifier_to_call = getattr(self, supported_classifiers[root.classifier_name])
				classifier_to_call(node=root)

			else:

				self.unary_class_results_inferring(root)

			# Creating hierarchy recursively
			if root.level < self.levels_number-1 and root.children_number > 0:
				self.recursive(root)

			classifiers_per_fold.append(self.classifiers)

			oracles_per_fold.append(self.oracle)
			predictions_per_fold.append(self.prediction)
			predictions_per_fold_all.append(self.prediction_all)

			bar_cnt += 1
			bar.update(bar_cnt)

		bar.finish()

		folder_discriminator = self.classifier_name

		if self.has_config:
			folder_discriminator = self.config_name

		material_folder = './data_'+folder_discriminator+'/material/'

		if not os.path.exists('./data_'+folder_discriminator):
			os.makedirs('./data_'+folder_discriminator)
			os.makedirs(material_folder)
		elif not os.path.exists(material_folder):
			os.makedirs(material_folder)

		type_discr = 'flow'
		feat_discr = '_f_' + str(self.features_number)

		if not self.has_config and self.packets_number != 0:
			type_discr = 'early'
			feat_discr = '_p_' + str(self.packets_number)
		elif self.has_config:
			if 'p' in self.config:
				type_discr = 'early'
			feat_discr = '_c_' + self.config_name

		material_features_folder = './data_'+folder_discriminator+'/material/features/'

		if not os.path.exists(material_folder):
			os.makedirs(material_folder)
			os.makedirs(material_features_folder)
		elif not os.path.exists(material_features_folder):
			os.makedirs(material_features_folder)

		for i in range(self.levels_number):

			file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'w+')
			file.close()

			for j in range(k):

				file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'a')

				file.write('@fold\n')
				for o, p in zip(oracles_per_fold[j][:,i], predictions_per_fold[j][:,i]):
					file.write(str(o)+' '+str(p)+'\n')

				file.close()

		# Inferring NW metrics per classifier

		for classifier in classifiers_per_fold[0]:

			file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'w+')
			file.close()

			file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'w+')
			file.close()

			file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'w+')
			file.close()

		for fold_n, classifiers in enumerate(classifiers_per_fold):

			for classifier in classifiers:

				file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'a')

				if classifier.level > 0:
					index = []

					for pred_n, prediction in enumerate(predictions_per_fold[fold_n][classifier.test_index, classifier.level-1]):
						if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
							index.append(classifier.test_index[pred_n])

					prediction_nw = predictions_per_fold[fold_n][index, classifier.level]
					oracle_nw = oracles_per_fold[fold_n][index, classifier.level]
				else:
					prediction_nw = predictions_per_fold[fold_n][classifier.test_index, classifier.level]
					oracle_nw = oracles_per_fold[fold_n][classifier.test_index, classifier.level]

				file.write('@fold\n')
				for o, p in zip(oracle_nw, prediction_nw):
					file.write(str(o)+' '+str(p)+'\n')

				file.close()

				file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'a')

				if classifier.level > 0:
					index = []

					for pred_n, prediction in enumerate(predictions_per_fold_all[fold_n][classifier.test_index, classifier.level-1]):
						if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
							index.append(classifier.test_index[pred_n])

					prediction_all = predictions_per_fold_all[fold_n][index, classifier.level]
					oracle_all = oracles_per_fold[fold_n][index, classifier.level]
				else:
					prediction_all = predictions_per_fold_all[fold_n][classifier.test_index_all, classifier.level]
					oracle_all = oracles_per_fold[fold_n][classifier.test_index_all, classifier.level]

				file.write('@fold\n')
				for o, p in zip(oracle_all, prediction_all):
					file.write(str(o)+' '+str(p)+'\n')

				file.close()

				file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'a')

				file.write('@fold\n')
				file.write(self.features_names[classifier.features_index[0]])

				for feature_index in classifier.features_index[1:]:
					file.write(','+self.features_names[feature_index])

				file.write('\n')

				file.close()

		graph_folder = './data_'+folder_discriminator+'/graph/'

		if not os.path.exists('./data_'+folder_discriminator):
			os.makedirs('./data_'+folder_discriminator)
			os.makedirs(graph_folder)
		elif not os.path.exists(graph_folder):
			os.makedirs(graph_folder)

		# Graph plot
		G = nx.DiGraph()
		for info in classifiers_per_fold[0]:
			G.add_node(str(info.level)+' '+info.tag, level=info.level,
					   tag=info.tag, children_tags=info.children_tags)
		for node_parent, data_parent in G.nodes.items():
			for node_child, data_child in G.nodes.items():
				if data_child['level']-data_parent['level'] == 1 and any(data_child['tag'] in s for s in data_parent['children_tags']):
					G.add_edge(node_parent, node_child)
		nx.write_gpickle(G, graph_folder+'multi_' + type_discr + feat_discr +'_graph.gml')

		###

		jvm.stop()

		sem.release()
Example #8
	def kfold_validation(self, k=10):

		sem.acquire()

		available_ram = psutil.virtual_memory()[1]
		available_ram = int(int(available_ram) * .9 * 1e-9)

		if available_ram > 5:
			jvm.start(max_heap_size='5g')
		else:
			jvm.start(max_heap_size=str(available_ram)+'g')

		###

		print('\nLoading '+self.input_file+' with opts -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
		# load .arff file
		dataset = arff.load(open(self.input_file, 'r'))
		data = np.array(dataset['data'])

		self.features_names = [x[0] for x in dataset['attributes']]

		self.attributes_number = data.shape[1]
		self.dataset_features_number = self.attributes_number - self.levels_number

		# Factorization of Nominal features_index
		encoder = CategoricalEncoder(encoding='ordinal')
		nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number])) if dataset['attributes'][i][1] != u'NUMERIC']
		if len(nominal_features_index) > 0:
			data[:, nominal_features_index] = encoder.fit_transform(
				data[:, nominal_features_index])

		# Impute missing value by fitting over training set and transforming both sets
		imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
		data[:, :self.dataset_features_number] = imp.fit_transform(
			data[:, :self.dataset_features_number])

		prediction = []
		probability = []
		oracle = []

		print('\n***\nStart testing with ' + str(k)+'Fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')

		bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar(
			'=', '[', ']'), ' ', progressbar.Percentage()])
		bar.start()

		temp_metrics = []

		skf = StratifiedKFold(n_splits=k, shuffle=True)
		bar_cnt = 0
		for train_index, test_index in skf.split(data, data[:, self.dataset_features_number + self.tag_under_test]):

			self.training_set = data[train_index, :self.dataset_features_number]
			self.testing_set = data[test_index, :self.dataset_features_number]
			self.ground_through = data[train_index,
									   self.dataset_features_number + self.tag_under_test]
			self.oracle = data[test_index,
							   self.dataset_features_number + self.tag_under_test]
			self.prediction = np.ndarray(
				shape=[len(test_index), 1], dtype='<U24')
			self.probability = np.ndarray(
				shape=[len(test_index), len(set(self.ground_through))], dtype='<U24')

			classifier_to_call = getattr(self, supported_classifiers[self.classifier_name])
			classifier_to_call()

			prediction.append(self.prediction)
			probability.append(self.probability)
			oracle.append(self.oracle)

			# print(type(prediction[bar_cnt]))
			# print(type(probability[bar_cnt]))

			bar_cnt += 1
			bar.update(bar_cnt)

		bar.finish()

		relations = []
		relations.append({  # Lv2:Lv1
			u'Tor': u'Tor',
			u'TorPT': u'Tor',
			u'TorApp': u'Tor',
			u'I2PApp80BW': u'I2P',
			u'I2PApp0BW': u'I2P',
			u'I2PApp': u'I2P',
			u'JonDonym': u'JonDonym'
		})

		relations.append({  # Lv3:Lv2
			u'JonDonym': u'JonDonym',
			u'I2PSNARK_App80BW': u'I2PApp80BW',
			u'IRC_App80BW': u'I2PApp80BW',
			u'Eepsites_App80BW': u'I2PApp80BW',
			u'I2PSNARK_App0BW': u'I2PApp0BW',
			u'IRC_App0BW': u'I2PApp0BW',
			u'Eepsites_App0BW': u'I2PApp0BW',
			u'I2PSNARK_App': u'I2PApp',
			u'IRC_App': u'I2PApp',
			u'Eepsites_App': u'I2PApp',
			u'ExploratoryTunnels_App': u'I2PApp',
			u'ParticipatingTunnels_App': u'I2PApp',
			u'Tor': u'Tor',
			u'Streaming': u'TorApp',
			u'Torrent': u'TorApp',
			u'Browsing': u'TorApp',
			u'Flashproxy': u'TorPT',
			u'FTE': u'TorPT',
			u'Meek': u'TorPT',
			u'Obfs3': u'TorPT',
			u'scramblesuit': u'TorPT'
		})

		oracle_inferred = []
		prediction_inferred = []

		for i in range(self.tag_under_test):
			oracle_inferred.append(list())
			prediction_inferred.append(list())

		# Inferring upper levels of the hierarchy
		for i in range(k):
			# Copy predictions into temporary arrays used for the consecutive label swaps
			inferred_prediction = prediction[i].copy()
			inferred_oracle = oracle[i].copy()
			for j in reversed(range(self.tag_under_test)):
				inferred_oracle = np.vectorize(
					relations[j].get)(list(inferred_oracle))
				inferred_prediction = np.vectorize(
					relations[j].get)(list(inferred_prediction))
				oracle_inferred[j].append(inferred_oracle)
				prediction_inferred[j].append(inferred_prediction)
		print('\n***\nStart testing with incremental gamma threshold\n***\n')

		bar = progressbar.ProgressBar(maxval=9, widgets=[progressbar.Bar(
			'=', '[', ']'), ' ', progressbar.Percentage()])
		bar.start()

		oracle_gamma = []
		prediction_gamma = []
		classified_ratio = []

		for i in range(9):
			gamma = float(i+1)/10.0

			oracle_gamma.append(list())
			prediction_gamma.append(list())
			classified_ratio.append(list())

			for j in range(k):
				indexes = []
				p_cnt = 0
				for p in probability[j]:
					if max(p) < gamma:
						indexes.append(p_cnt)
					p_cnt += 1
				gamma_oracle = np.delete(oracle[j], [indexes])
				gamma_prediction = np.delete(prediction[j], [indexes])
				oracle_gamma[i].append(gamma_oracle)
				prediction_gamma[i].append(gamma_prediction)
				classified_ratio[i].append(
					float(len(gamma_prediction))/float(len(prediction[j])))

			bar.update(i)

		bar.finish()

		data_folder = './data_'+self.classifier_name+'/material/'

		if not os.path.exists('./data_'+self.classifier_name):
			os.makedirs('./data_'+self.classifier_name)
			os.makedirs(data_folder)
		elif not os.path.exists(data_folder):
			os.makedirs(data_folder)

		if self.packets_number != 0:
			file = open(data_folder+'flat_early_level_'+str(self.level_target) +
						'_p_'+str(self.packets_number)+'.dat', 'w+')
		else:
			file = open(data_folder+'flat_flow_level_'+str(self.level_target) +
						'_f_'+str(self.features_number)+'.dat', 'w+')

		for i in range(k):
			file.write('@fold\n')
			for o, p in zip(oracle[i], prediction[i]):
				file.write(str(o)+' '+str(p)+'\n')

		file.close()

		for i in range(self.tag_under_test):

			if self.packets_number != 0:
				file = open(data_folder+'flat_early_level_'+str(self.level_target) +
							'_p_'+str(self.packets_number)+'_inferred_'+str(i+1)+'.dat', 'w+')
			else:
				file = open(data_folder+'flat_flow_level_'+str(self.level_target) +
							'_f_'+str(self.features_number)+'_inferred_'+str(i+1)+'.dat', 'w+')

			for j in range(k):
				file.write('@fold\n')
				for o, p in zip(oracle_inferred[i][j], prediction_inferred[i][j]):
					file.write(str(o)+' '+str(p)+'\n')

			file.close()

		for i in range(9):
			if self.packets_number != 0:
				file = open(data_folder+'flat_early_level_'+str(self.level_target)+'_p_' +
							str(self.packets_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')
			else:
				file = open(data_folder+'flat_flow_level_'+str(self.level_target)+'_f_' +
							str(self.features_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')

			for j in range(k):
				file.write('@fold_cr\n')
				file.write(str(classified_ratio[i][j])+'\n')
				for o, p in zip(oracle_gamma[i][j], prediction_gamma[i][j]):
					file.write(str(o)+' '+str(p)+'\n')

			file.close()

		###

		jvm.stop()

		sem.release()
Example #9
# as well as the technician part of this category.

#########################################################################
# Encoding categorical data using SimilarityEncoder
# -------------------------------------------------
#
# A typical data-science workflow uses one-hot encoding to represent
# categories.
from sklearn.preprocessing import CategoricalEncoder

# encoding simply a subset of the observations
n_obs = 20
employee_position_titles = values['Employee Position Title'].head(
    n_obs).to_frame()
categorical_encoder = CategoricalEncoder(encoding='onehot-dense')
one_hot_encoded = categorical_encoder.fit_transform(employee_position_titles)
f3, ax3 = plt.subplots(figsize=(6, 6))
ax3.matshow(one_hot_encoded)
ax3.set_title('Employee Position Title values, one-hot encoded')
ax3.axis('off')
f3.tight_layout()

#########################################################################
# The corresponding encoding is very sparse.
#
# SimilarityEncoder can be used to replace one-hot encoding, capturing the
# similarities between categories:
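
# Note: `similarity_encoder` used below is not defined in this excerpt; a
# plausible construction, assuming the dirty_cat package this example
# appears to come from, would be:
from dirty_cat import SimilarityEncoder
similarity_encoder = SimilarityEncoder(similarity='ngram')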

f4, ax4 = plt.subplots(figsize=(6, 6))
similarity_encoded = similarity_encoder.fit_transform(employee_position_titles)
ax4.matshow(similarity_encoded)
Example #10
def extract_features(df_train,
                     df_inference,
                     selected_feature_names_categ,
                     selected_feature_names_interval,
                     shuffle=True,
                     fuzzy_matching=True,
                     use_onehot=True,
                     use_sentence_vec=False):

    features_to_use = []
    variable_types = []

    if not use_onehot:
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')

    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Check to ensure all cols exist (avoid keyerrors)
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])

    # for feature in selected_feature_names_categ:
    #     le = preprocessing.LabelEncoder()
    #     print(print_attr_overview(df[feature], True, topn=10))
    #     df[feature + '_encoded'] = le.fit_transform(df[feature])
    #     features_to_use.append(feature + '_encoded')

    if use_onehot:
        # Each Feature has its own vocab...
        vocabs = defaultdict(list)

        X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
        X = df_train[colnames_categ]
        X = X.apply(preprocess_categ_series)

        enc = CategoricalEncoder(handle_unknown='ignore')
        enc.fit_transform(X)

        # pprint(enc.categories_)

    else:
        le = preprocessing.LabelEncoder()
        all_unique = []

        # FIT LABEL_ENCODER (combine vocabs for train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                # print(print_attr_overview(df[feature]))

                s = df[feature]

                # Remove categorical entries with fewer than 12 occurrences
                a = s.value_counts()
                s[s.isin(a.index[a < 12])] = np.nan

                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                # print(np.unique(df[feature]))
                all_unique.extend(np.unique(s))

        le.fit(all_unique)

        # TRANSFORM LABEL_ENCODER
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                print(feature)
                # print(df[feature])
                s = df[feature]

                s = s.map(lambda x: x.lower() if type(x) == str else x)
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            s = s.map(lambda x: x.replace(',', '') if type(x) == str else x)
            # print(s)
            s = pd.to_numeric(s, errors='coerce')

            # Set null values to zero
            # TODO: try set nan to the mean instead of zero
            # TODO: try different types of normalisation
            s[np.logical_not(s.notnull())] = 0.0

            df[feature + '_normed'] = norm_zscore(s)

    # features_to_use.append('sentence_vec')
    # variable_types.append('embedding')

    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray(
            [get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)

        # One-Hot Categorical Encoding
        train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
        train_X_interval = df_train[features_to_use].as_matrix()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])

        inference_X_onehot = enc.transform(
            df_inference[colnames_categ]).toarray()
        inference_X_interval = df_inference[features_to_use].as_matrix()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])

        # Prepend placeholder entries for the one-hot encoded columns to features_to_use
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        features_to_use = ['numerical'
                           for i in range(len_onehot)] + features_to_use

    else:
        # Index Categorical Encoding (integer)
        train_X = df_train[features_to_use].as_matrix()
        inference_X = df_inference[features_to_use].as_matrix()

    train_y = df_train['case_status'].as_matrix()

    if use_sentence_vec:
        # Stack with sentence embedding
        train_X = np.hstack([train_X.copy(), train_embedding_mat])
        inference_X = np.hstack([inference_X.copy(), inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)

    print(train_X.shape)
    print(inference_X.shape)
    # exit()
    inference_row_id = df_inference['row ID']

    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)

    # print(X.shape)
    # print(y.shape)

    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))

    return train_X, train_y, inference_row_id, inference_X, vocab_size, variable_types, features_to_use