def _sklearn2weka(self, features, labels=None):
    if labels is not None:
        # Encode string labels as ordinal values for the Weka nominal class attribute
        encoder = CategoricalEncoder(encoding='ordinal')
        labels_nominal = encoder.fit_transform(np.array(labels).reshape(-1, 1))

        # Build the nominal-value -> original-label mapping once
        if not hasattr(self, '_dict'):
            label_dict = {}
            for label, nominal in zip(labels, labels_nominal):
                if nominal.item(0) not in label_dict:
                    label_dict[nominal.item(0)] = label
            self._dict = label_dict

        labels_column = np.reshape(labels_nominal, [labels_nominal.shape[0], 1])

    # Convert the feature matrix to Weka Instances and append the nominal class attribute
    weka_dataset = ndarray_to_instances(
        np.ascontiguousarray(features, dtype=np.float_), 'weka_dataset')
    weka_dataset.insert_attribute(
        Attribute.create_nominal('tag', [str(float(i)) for i in range(len(self._dict))]),
        features.shape[1])

    if labels is not None:
        for index, inst in enumerate(weka_dataset):
            inst.set_value(features.shape[1], labels_column[index])
            weka_dataset.set_instance(index, inst)

    return weka_dataset
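# A minimal usage sketch of the helper above. The names here are illustrative, not from
# the original project: it assumes a wrapper object exposing _sklearn2weka and a JVM
# already started via python-weka-wrapper, with X a numeric feature matrix and y string labels.
import numpy as np

X = np.random.rand(10, 4)
y = ['tcp', 'udp', 'tcp', 'udp', 'tcp', 'udp', 'tcp', 'udp', 'tcp', 'udp']
weka_instances = wrapper._sklearn2weka(X, y)   # `wrapper` is a hypothetical instance
weka_instances.class_is_last()                 # mark the appended 'tag' attribute as the class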
def encode_cat(dat):
    cat_encoder = CategoricalEncoder(encoding='onehot-dense')
    dat = dat.astype('str')
    dat_reshaped = dat.values.reshape(-1, 1)
    dat_1hot = cat_encoder.fit_transform(dat_reshaped)
    col_names = [dat.name + '_' + str(x) for x in list(cat_encoder.categories_[0])]
    return pd.DataFrame(dat_1hot, columns=col_names)
def encode_cat(dat): """ functon to return a labeled data frame with one hot encoding """ cat_encoder = CategoricalEncoder(encoding="onehot-dense") dat = dat.astype('str') dat_reshaped = dat.values.reshape(-1, 1) dat_1hot = cat_encoder.fit_transform(dat_reshaped) col_names = [ dat.name + "_" + str(x) for x in list(cat_encoder.categories_[0]) ] return pd.DataFrame(dat_1hot, columns=col_names)
def convert_categorical_features(df):
    enc = CategoricalEncoder(encoding='ordinal')
    encoded_features = enc.fit_transform(df[['dim_is_requested', 'dim_market', 'dim_room_type',
                                             'cancel_policy', 'dim_is_instant_bookable']])
    encoded_df = pd.DataFrame(encoded_features, index=df.index,
                              columns=['dim_is_requested', 'dim_market', 'dim_room_type',
                                       'cancel_policy', 'dim_is_instant_bookable'])
    col = df.columns.tolist()
    col_non_cat = col[1:3] + col[5:6] + col[7:10] + col[11:]
    df_non_cat = df[col_non_cat]
    col_cat = encoded_df.columns.tolist()
    col_full = col_cat[:] + col_non_cat[:]
    stack_full = np.column_stack([encoded_df, df_non_cat])
    stack_df = pd.DataFrame(stack_full, index=df.index, columns=col_full)
    return stack_df
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder()
housing_cat_1hot = encoder.fit_transform(housing_cat_encoded.reshape(-1, 1))
housing_cat_1hot

# In[38]:

housing_cat_1hot.toarray()

# In[39]:

from sklearn.preprocessing import CategoricalEncoder

cat_encoder = CategoricalEncoder()
housing_cat_reshaped = housing_cat.values.reshape(-1, 1)
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[40]:

cat_encoder = CategoricalEncoder(encoding="onehot-dense")
housing_cat_1hot = cat_encoder.fit_transform(housing_cat_reshaped)
housing_cat_1hot

# In[41]:

cat_encoder.categories_

# In[42]:

from sklearn.base import BaseEstimator, TransformerMixin
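# Note: CategoricalEncoder only ever existed in a scikit-learn 0.20 pre-release and was
# split into OneHotEncoder and OrdinalEncoder before release. A minimal sketch of the
# released-API equivalent of the dense one-hot cells above, reusing housing_cat_reshaped:
from sklearn.preprocessing import OneHotEncoder

one_hot = OneHotEncoder(sparse=False)                      # dense output, like encoding="onehot-dense"
housing_cat_1hot = one_hot.fit_transform(housing_cat_reshaped)
one_hot.categories_                                        # same attribute name as CategoricalEncoder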
# In[4]:

enc = CategoricalEncoder(encoding='onehot-dense')
X_2 = np.array(X[:, 0].reshape(-1, 1))
Xq_2 = np.array(X_q[:, 0].reshape(-1, 1))
attributes = [dataset['attributes'][0][0]]
for i, (name, relation) in enumerate(dataset['attributes'][1:-1]):
    if relation == 'NUMERIC':
        X_2 = np.hstack((X_2, X[:, i + 1].reshape(-1, 1)))
        Xq_2 = np.hstack((Xq_2, X_q[:, i + 1].reshape(-1, 1)))
        attributes.append(name)
        continue
    X_2 = np.hstack((X_2, enc.fit_transform(X[:, i + 1].reshape(-1, 1))))
    Xq_2 = np.hstack((Xq_2, enc.transform(X_q[:, i + 1].reshape(-1, 1))))
    for category in enc.categories_[0]:
        attributes.append(category)

X = X_2.astype(float)
X_q = Xq_2.astype(float)
print('Num features: %d' % len(attributes))
print(attributes)

# We now have 51 features; for example, the feature "entrepreneur" can take the value 0 or 1,
# 0 meaning the person is not an entrepreneur and 1 meaning they are.

# ### Most informative features
# Before we use PCA to remove some features, we will see which features are considered most
# informative when we use a Logistic Regression classifier.
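# A minimal sketch of ranking features by informativeness with Logistic Regression,
# assuming the X and attributes built above and a label vector y defined elsewhere in
# the notebook (binary target assumed). Ranking by absolute coefficient size is an
# illustrative choice, not necessarily the notebook's exact method.
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression().fit(X, y)
ranked = sorted(zip(attributes, clf.coef_[0]), key=lambda t: abs(t[1]), reverse=True)
for name, coef in ranked[:10]:
    print('%-30s %+.3f' % (name, coef))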
def kfold_validation(self, k=10):
    sem.acquire()
    available_ram = psutil.virtual_memory()[1]
    available_ram = int(int(available_ram) * .9 * 1e-9)
    if available_ram > 5:
        jvm.start(max_heap_size='5g')
    else:
        jvm.start(max_heap_size=str(available_ram)+'g')
    ###
    print('\nLoading '+self.input_file+' with opts -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
    # load .arff file
    dataset = arff.load(open(self.input_file, 'r'))
    data = np.array(dataset['data'])
    self.features_names = [x[0] for x in dataset['attributes']]
    self.attributes_number = data.shape[1]
    self.dataset_features_number = self.attributes_number - self.levels_number

    # Factorization of nominal features
    encoder = CategoricalEncoder(encoding='ordinal')
    nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number]))
                              if dataset['attributes'][i][1] != u'NUMERIC']
    if len(nominal_features_index) > 0:
        data[:, nominal_features_index] = encoder.fit_transform(data[:, nominal_features_index])

    # Impute missing values by fitting over the training set and transforming both sets
    imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
    data[:, :self.dataset_features_number] = imp.fit_transform(data[:, :self.dataset_features_number])

    classifiers_per_fold = []
    oracles_per_fold = []
    predictions_per_fold = []
    predictions_per_fold_all = []

    print('\n***\nStart testing with '+str(k)+'Fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')
    bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    bar_cnt = 0
    for train_index, test_index in skf.split(data, data[:, self.attributes_number-1]):
        self.classifiers = []
        self.training_set = data[train_index, :self.dataset_features_number]
        self.testing_set = data[test_index, :self.dataset_features_number]
        self.ground_through = data[train_index, self.dataset_features_number:]
        self.oracle = data[test_index, self.dataset_features_number:]
        self.prediction = np.ndarray(shape=[len(test_index), self.levels_number], dtype='<U24')
        self.prediction_all = np.ndarray(shape=[len(test_index), self.levels_number], dtype='<U24')

        root = Tree()
        root.train_index = [i for i in range(self.training_set.shape[0])]
        root.test_index = [i for i in range(self.testing_set.shape[0])]
        root.test_index_all = root.test_index
        root.children_tags = list(set(self.ground_through[root.train_index, root.level]))
        root.children_number = len(root.children_tags)

        if self.has_config:
            if 'f' in config[root.tag + '_' + str(root.level + 1)]:
                root.features_number = config[root.tag + '_' + str(root.level + 1)]['f']
            elif 'p' in config[root.tag + '_' + str(root.level + 1)]:
                root.packets_number = config[root.tag + '_' + str(root.level + 1)]['p']
            root.classifier_name = config[root.tag + '_' + str(root.level + 1)]['c']
            print('config', 'tag', root.tag, 'level', root.level, 'f', root.features_number, 'c', root.classifier_name)
        else:
            root.features_number = self.features_number
            root.packets_number = self.packets_number
            root.classifier_name = self.classifier_name

        self.classifiers.append(root)
        if root.children_number > 1:
            classifier_to_call = getattr(self, supported_classifiers[root.classifier_name])
            classifier_to_call(node=root)
        else:
            self.unary_class_results_inferring(root)

        # Creating hierarchy recursively
        if root.level < self.levels_number-1 and root.children_number > 0:
            self.recursive(root)

        classifiers_per_fold.append(self.classifiers)
        oracles_per_fold.append(self.oracle)
        predictions_per_fold.append(self.prediction)
        predictions_per_fold_all.append(self.prediction_all)
        bar_cnt += 1
        bar.update(bar_cnt)
    bar.finish()

    folder_discriminator = self.classifier_name
    if self.has_config:
        folder_discriminator = self.config_name
    material_folder = './data_'+folder_discriminator+'/material/'
    if not os.path.exists('./data_'+folder_discriminator):
        os.makedirs('./data_'+folder_discriminator)
        os.makedirs(material_folder)
    elif not os.path.exists(material_folder):
        os.makedirs(material_folder)

    type_discr = 'flow'
    feat_discr = '_f_' + str(self.features_number)
    if not self.has_config and self.packets_number != 0:
        type_discr = 'early'
        feat_discr = '_p_' + str(self.packets_number)
    elif self.has_config:
        if 'p' in self.config:
            type_discr = 'early'
        feat_discr = '_c_' + self.config_name

    material_features_folder = './data_'+folder_discriminator+'/material/features/'
    if not os.path.exists(material_folder):
        os.makedirs(material_folder)
        os.makedirs(material_features_folder)
    elif not os.path.exists(material_features_folder):
        os.makedirs(material_features_folder)

    for i in range(self.levels_number):
        file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'w+')
        file.close()
        for j in range(k):
            file = open(material_folder + 'multi_' + type_discr + '_level_' + str(i+1) + feat_discr + '.dat', 'a')
            file.write('@fold\n')
            for o, p in zip(oracles_per_fold[j][:, i], predictions_per_fold[j][:, i]):
                file.write(str(o)+' '+str(p)+'\n')
            file.close()

    # Inferring NW metrics per classifier
    for classifier in classifiers_per_fold[0]:
        file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'w+')
        file.close()
        file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'w+')
        file.close()
        file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'w+')
        file.close()

    for fold_n, classifiers in enumerate(classifiers_per_fold):
        for classifier in classifiers:
            file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '.dat', 'a')
            if classifier.level > 0:
                index = []
                for pred_n, prediction in enumerate(predictions_per_fold[fold_n][classifier.test_index, classifier.level-1]):
                    if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
                        index.append(classifier.test_index[pred_n])
                prediction_nw = predictions_per_fold[fold_n][index, classifier.level]
                oracle_nw = oracles_per_fold[fold_n][index, classifier.level]
            else:
                prediction_nw = predictions_per_fold[fold_n][classifier.test_index, classifier.level]
                oracle_nw = oracles_per_fold[fold_n][classifier.test_index, classifier.level]
            file.write('@fold\n')
            for o, p in zip(oracle_nw, prediction_nw):
                file.write(str(o)+' '+str(p)+'\n')
            file.close()

            file = open(material_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_all.dat', 'a')
            if classifier.level > 0:
                index = []
                for pred_n, prediction in enumerate(predictions_per_fold_all[fold_n][classifier.test_index, classifier.level-1]):
                    if prediction == oracles_per_fold[fold_n][classifier.test_index[pred_n], classifier.level-1]:
                        index.append(classifier.test_index[pred_n])
                prediction_all = predictions_per_fold_all[fold_n][index, classifier.level]
                oracle_all = oracles_per_fold[fold_n][index, classifier.level]
            else:
                prediction_all = predictions_per_fold_all[fold_n][classifier.test_index_all, classifier.level]
                oracle_all = oracles_per_fold[fold_n][classifier.test_index_all, classifier.level]
            file.write('@fold\n')
            for o, p in zip(oracle_all, prediction_all):
                file.write(str(o)+' '+str(p)+'\n')
            file.close()

            file = open(material_features_folder + 'multi_' + type_discr + '_level_' + str(classifier.level+1) + feat_discr + '_tag_' + str(classifier.tag) + '_features.dat', 'a')
            file.write('@fold\n')
            file.write(self.features_names[classifier.features_index[0]])
            for feature_index in classifier.features_index[1:]:
                file.write(','+self.features_names[feature_index])
            file.write('\n')
            file.close()

    graph_folder = './data_'+folder_discriminator+'/graph/'
    if not os.path.exists('./data_'+folder_discriminator):
        os.makedirs('./data_'+folder_discriminator)
        os.makedirs(graph_folder)
    elif not os.path.exists(graph_folder):
        os.makedirs(graph_folder)

    # Graph plot
    G = nx.DiGraph()
    for info in classifiers_per_fold[0]:
        G.add_node(str(info.level)+' '+info.tag, level=info.level, tag=info.tag, children_tags=info.children_tags)
    for node_parent, data_parent in G.nodes.items():
        for node_child, data_child in G.nodes.items():
            if data_child['level']-data_parent['level'] == 1 and any(data_child['tag'] in s for s in data_parent['children_tags']):
                G.add_edge(node_parent, node_child)
    nx.write_gpickle(G, graph_folder+'multi_' + type_discr + feat_discr + '_graph.gml')
    ###
    jvm.stop()
    sem.release()
def kfold_validation(self, k=10):
    sem.acquire()
    available_ram = psutil.virtual_memory()[1]
    available_ram = int(int(available_ram) * .9 * 1e-9)
    if available_ram > 5:
        jvm.start(max_heap_size='5g')
    else:
        jvm.start(max_heap_size=str(available_ram)+'g')
    ###
    print('\nLoading '+self.input_file+' with opts -f'+str(self.features_number)+' -c'+self.classifier_name+'\n')
    # load .arff file
    dataset = arff.load(open(self.input_file, 'r'))
    data = np.array(dataset['data'])
    self.features_names = [x[0] for x in dataset['attributes']]
    self.attributes_number = data.shape[1]
    self.dataset_features_number = self.attributes_number - self.levels_number

    # Factorization of nominal features
    encoder = CategoricalEncoder(encoding='ordinal')
    nominal_features_index = [i for i in range(len(dataset['attributes'][:-self.levels_number]))
                              if dataset['attributes'][i][1] != u'NUMERIC']
    if len(nominal_features_index) > 0:
        data[:, nominal_features_index] = encoder.fit_transform(data[:, nominal_features_index])

    # Impute missing values by fitting over the training set and transforming both sets
    imp = SimpleImputer(missing_values='NaN', strategy='most_frequent')
    data[:, :self.dataset_features_number] = imp.fit_transform(data[:, :self.dataset_features_number])

    prediction = []
    probability = []
    oracle = []

    print('\n***\nStart testing with ' + str(k)+'Fold cross-validation -f'+str(self.features_number)+' -c'+self.classifier_name+'\n***\n')
    bar = progressbar.ProgressBar(maxval=k, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    temp_metrics = []
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    bar_cnt = 0
    for train_index, test_index in skf.split(data, data[:, self.dataset_features_number + self.tag_under_test]):
        self.training_set = data[train_index, :self.dataset_features_number]
        self.testing_set = data[test_index, :self.dataset_features_number]
        self.ground_through = data[train_index, self.dataset_features_number + self.tag_under_test]
        self.oracle = data[test_index, self.dataset_features_number + self.tag_under_test]
        self.prediction = np.ndarray(shape=[len(test_index), 1], dtype='<U24')
        self.probability = np.ndarray(shape=[len(test_index), len(set(self.ground_through))], dtype='<U24')

        classifier_to_call = getattr(self, supported_classifiers[self.classifier_name])
        classifier_to_call()

        prediction.append(self.prediction)
        probability.append(self.probability)
        oracle.append(self.oracle)
        bar_cnt += 1
        bar.update(bar_cnt)
    bar.finish()

    relations = []
    relations.append({  # Lv2:Lv1
        u'Tor': u'Tor',
        u'TorPT': u'Tor',
        u'TorApp': u'Tor',
        u'I2PApp80BW': u'I2P',
        u'I2PApp0BW': u'I2P',
        u'I2PApp': u'I2P',
        u'JonDonym': u'JonDonym'
    })
    relations.append({  # Lv3:Lv2
        u'JonDonym': u'JonDonym',
        u'I2PSNARK_App80BW': u'I2PApp80BW',
        u'IRC_App80BW': u'I2PApp80BW',
        u'Eepsites_App80BW': u'I2PApp80BW',
        u'I2PSNARK_App0BW': u'I2PApp0BW',
        u'IRC_App0BW': u'I2PApp0BW',
        u'Eepsites_App0BW': u'I2PApp0BW',
        u'I2PSNARK_App': u'I2PApp',
        u'IRC_App': u'I2PApp',
        u'Eepsites_App': u'I2PApp',
        u'ExploratoryTunnels_App': u'I2PApp',
        u'ParticipatingTunnels_App': u'I2PApp',
        u'Tor': u'Tor',
        u'Streaming': u'TorApp',
        u'Torrent': u'TorApp',
        u'Browsing': u'TorApp',
        u'Flashproxy': u'TorPT',
        u'FTE': u'TorPT',
        u'Meek': u'TorPT',
        u'Obfs3': u'TorPT',
        u'scramblesuit': u'TorPT'
    })

    oracle_inferred = []
    prediction_inferred = []
    for i in range(self.tag_under_test):
        oracle_inferred.append(list())
        prediction_inferred.append(list())

    # Inferring superior levels
    for i in range(k):
        # Assign prediction to a dummy to use it in consecutive label swaps
        inferred_prediction = prediction[i].copy()
        inferred_oracle = oracle[i].copy()
        for j in reversed(range(self.tag_under_test)):
            inferred_oracle = np.vectorize(relations[j].get)(list(inferred_oracle))
            inferred_prediction = np.vectorize(relations[j].get)(list(inferred_prediction))
            oracle_inferred[j].append(inferred_oracle)
            prediction_inferred[j].append(inferred_prediction)

    print('\n***\nStart testing with incremental gamma threshold\n***\n')
    bar = progressbar.ProgressBar(maxval=9, widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])
    bar.start()
    oracle_gamma = []
    prediction_gamma = []
    classified_ratio = []
    for i in range(9):
        gamma = float(i+1)/10.0
        oracle_gamma.append(list())
        prediction_gamma.append(list())
        classified_ratio.append(list())
        for j in range(k):
            indexes = []
            p_cnt = 0
            for p in probability[j]:
                if max(p) < gamma:
                    indexes.append(p_cnt)
                p_cnt += 1
            gamma_oracle = np.delete(oracle[j], [indexes])
            gamma_prediction = np.delete(prediction[j], [indexes])
            oracle_gamma[i].append(gamma_oracle)
            prediction_gamma[i].append(gamma_prediction)
            classified_ratio[i].append(float(len(gamma_prediction))/float(len(prediction[j])))
        bar.update(i)
    bar.finish()

    data_folder = './data_'+self.classifier_name+'/material/'
    if not os.path.exists('./data_'+self.classifier_name):
        os.makedirs('./data_'+self.classifier_name)
        os.makedirs(data_folder)
    elif not os.path.exists(data_folder):
        os.makedirs(data_folder)

    if self.packets_number != 0:
        file = open(data_folder+'flat_early_level_'+str(self.level_target) + '_p_'+str(self.packets_number)+'.dat', 'w+')
    else:
        file = open(data_folder+'flat_flow_level_'+str(self.level_target) + '_f_'+str(self.features_number)+'.dat', 'w+')
    for i in range(k):
        file.write('@fold\n')
        for o, p in zip(oracle[i], prediction[i]):
            file.write(str(o)+' '+str(p)+'\n')
    file.close()

    for i in range(self.tag_under_test):
        if self.packets_number != 0:
            file = open(data_folder+'flat_early_level_'+str(self.level_target) + '_p_'+str(self.packets_number)+'_inferred_'+str(i+1)+'.dat', 'w+')
        else:
            file = open(data_folder+'flat_flow_level_'+str(self.level_target) + '_f_'+str(self.features_number)+'_inferred_'+str(i+1)+'.dat', 'w+')
        for j in range(k):
            file.write('@fold\n')
            for o, p in zip(oracle_inferred[i][j], prediction_inferred[i][j]):
                file.write(str(o)+' '+str(p)+'\n')
        file.close()

    for i in range(9):
        if self.packets_number != 0:
            file = open(data_folder+'flat_early_level_'+str(self.level_target)+'_p_' + str(self.packets_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')
        else:
            file = open(data_folder+'flat_flow_level_'+str(self.level_target)+'_f_' + str(self.features_number)+'_gamma_'+str(float(i+1)/10.0)+'.dat', 'w+')
        for j in range(k):
            file.write('@fold_cr\n')
            file.write(str(classified_ratio[i][j])+'\n')
            for o, p in zip(oracle_gamma[i][j], prediction_gamma[i][j]):
                file.write(str(o)+' '+str(p)+'\n')
        file.close()
    ###
    jvm.stop()
    sem.release()
# as well as the technician part of this category.

#########################################################################
# Encoding categorical data using SimilarityEncoder
# -------------------------------------------------
#
# A typical data-science workflow uses one-hot encoding to represent
# categories.
from sklearn.preprocessing import CategoricalEncoder

# encoding simply a subset of the observations
n_obs = 20
employee_position_titles = values['Employee Position Title'].head(n_obs).to_frame()
categorical_encoder = CategoricalEncoder(encoding='onehot-dense')
one_hot_encoded = categorical_encoder.fit_transform(employee_position_titles)
f3, ax3 = plt.subplots(figsize=(6, 6))
ax3.matshow(one_hot_encoded)
ax3.set_title('Employee Position Title values, one-hot encoded')
ax3.axis('off')
f3.tight_layout()

#########################################################################
# The corresponding matrix is very sparse.
#
# SimilarityEncoder can be used to replace one-hot encoding, capturing the
# similarities between categories:
f4, ax4 = plt.subplots(figsize=(6, 6))
similarity_encoded = similarity_encoder.fit_transform(employee_position_titles)
ax4.matshow(similarity_encoded)
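#########################################################################
# Note: `similarity_encoder` is defined earlier in the full example. A minimal
# sketch of how such an encoder could be constructed, assuming dirty_cat's
# SimilarityEncoder with n-gram string similarity (an assumption, not necessarily
# the exact configuration used above):
from dirty_cat import SimilarityEncoder

similarity_encoder = SimilarityEncoder(similarity='ngram')
# It follows the same fit/transform API as CategoricalEncoder, but each output column
# holds a continuous similarity to one of the observed categories rather than a 0/1 flag.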
def extract_features(df_train, df_inference, selected_feature_names_categ,
                     selected_feature_names_interval, shuffle=True, fuzzy_matching=True,
                     use_onehot=True, use_sentence_vec=False):
    features_to_use = []
    variable_types = []
    if not use_onehot:
        for feature in selected_feature_names_categ:
            features_to_use.append(feature + '_encoded')
            variable_types.append('categorical_nominal')
    # Append interval AFTER categorical!!
    for feature in selected_feature_names_interval:
        features_to_use.append(feature + '_normed')
        variable_types.append('numerical')

    # Check to ensure all cols exist (avoid KeyErrors)
    for df in [df_train, df_inference]:
        df[selected_feature_names_categ + selected_feature_names_interval]
        print(df['combined_str'])

    # for feature in selected_feature_names_categ:
    #     le = preprocessing.LabelEncoder()
    #     print(print_attr_overview(df[feature], True, topn=10))
    #     df[feature + '_encoded'] = le.fit_transform(df[feature])
    #     features_to_use.append(feature + '_encoded')

    if use_onehot:
        # Each feature has its own vocab...
        vocabs = defaultdict(list)
        X = pd.concat([df_train[colnames_categ], df_inference[colnames_categ]])
        X = df_train[colnames_categ]
        X = X.apply(preprocess_categ_series)
        enc = CategoricalEncoder(handle_unknown='ignore')
        enc.fit_transform(X)
        # pprint(enc.categories_)
    else:
        le = preprocessing.LabelEncoder()
        all_unique = []
        # FIT LABEL_ENCODER (combine vocabs for train and inference)
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                # print(print_attr_overview(df[feature]))
                s = df[feature]
                # Remove categorical entries with fewer than 12 occurrences
                a = s.value_counts()
                s[s.isin(a.index[a < 12])] = np.nan
                s[s.isnull()] = "EMPTY_PLACEHOLDER"
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                # print(np.unique(df[feature]))
                all_unique.extend(np.unique(s))
        le.fit(all_unique)
        # TRANSFORM LABEL_ENCODER
        for df in [df_train, df_inference]:
            for feature in selected_feature_names_categ:
                print(feature)
                # print(df[feature])
                s = df[feature]
                s = s.map(lambda x: x.lower() if type(x) == str else x)
                df[feature + '_encoded'] = le.transform(s)
                print(feature, len(np.unique(s)))

    for df in [df_train, df_inference]:
        for feature in selected_feature_names_interval:
            s = df[feature]
            s = s.map(lambda x: x.replace(',', '') if type(x) == str else x)
            # print(s)
            s = pd.to_numeric(s, errors='coerce')
            # Set null values to zero
            # TODO: try setting NaN to the mean instead of zero
            # TODO: try different types of normalisation
            s[np.logical_not(s.notnull())] = 0.0
            df[feature + '_normed'] = norm_zscore(s)

    # features_to_use.append('sentence_vec')
    # variable_types.append('embedding')
    if use_sentence_vec:
        from ft_embedding import get_sentence_vec
        print('Computing sentence vectors for dataset')
        train_embedding_mat = np.asarray([get_sentence_vec(x) for x in df_train['combined_str']])
        inference_embedding_mat = np.asarray([get_sentence_vec(x) for x in df_inference['combined_str']])
        variable_types.append('ft_embedding')

    if use_onehot:
        print(features_to_use)
        # One-hot categorical encoding
        train_X_onehot = enc.transform(df_train[colnames_categ]).toarray()
        train_X_interval = df_train[features_to_use].as_matrix()
        print(train_X_onehot.shape)
        print(train_X_interval.shape)
        train_X = np.hstack([train_X_onehot, train_X_interval])
        inference_X_onehot = enc.transform(df_inference[colnames_categ]).toarray()
        inference_X_interval = df_inference[features_to_use].as_matrix()
        print(inference_X_onehot.shape)
        print(inference_X_interval.shape)
        inference_X = np.hstack([inference_X_onehot, inference_X_interval])
        # Add (one-hot encoded) numerical features to variable_types
        len_onehot = train_X_onehot.shape[1]
        print(len_onehot)
        features_to_use = ['numerical' for i in range(len_onehot)] + features_to_use
    else:
        # Index categorical encoding (integer)
        train_X = df_train[features_to_use].as_matrix()
        inference_X = df_inference[features_to_use].as_matrix()

    train_y = df_train['case_status'].as_matrix()

    if use_sentence_vec:
        # Stack with sentence embedding
        train_X = np.hstack([train_X.copy(), train_embedding_mat])
        inference_X = np.hstack([inference_X.copy(), inference_embedding_mat])
        print(train_embedding_mat.shape)
        print(inference_embedding_mat.shape)
        print(train_X.shape)
        print(inference_X.shape)
        # exit()

    inference_row_id = df_inference['row ID']
    if shuffle:
        train_X, train_y = skl_shuffle(train_X, train_y)
    # print(X.shape)
    # print(y.shape)
    if use_onehot:
        vocab_size = 0
    else:
        vocab_size = len(list(le.classes_))
    return train_X, train_y, inference_row_id, inference_X, vocab_size, variable_types, features_to_use
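# A minimal usage sketch of extract_features. The file paths and column names below are
# hypothetical placeholders, not values from the original project:
train_df = pd.read_csv('train.csv')
inference_df = pd.read_csv('inference.csv')

categ_cols = ['employer_state', 'job_level']            # illustrative categorical columns
interval_cols = ['wage_offer', 'experience_years']      # illustrative numeric columns

(train_X, train_y, inference_row_id, inference_X,
 vocab_size, variable_types, features_to_use) = extract_features(
    train_df, inference_df, categ_cols, interval_cols,
    shuffle=True, use_onehot=True, use_sentence_vec=False)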