class Multi_classes_classifier_on_column(BaseEstimator):
    """Multi-label classifier trained on one text column of a tabular input.

    The selected column is passed through ``prepro.clean_text``, vectorized
    with ``self.vectorizer`` and fed to a ``ClassifierChain`` built around
    ``base_classifier``.
    """

    def __init__(self, base_classifier, column):
        """Wrap ``base_classifier`` in a ``ClassifierChain``.

        Args:
            base_classifier: Estimator handed to ``ClassifierChain``.
            column: Column label, or an integer position that ``fit``
                resolves to a label using ``X.columns``.
        """
        self.column = column
        self.classifier = ClassifierChain(base_classifier)
        # Created lazily in ``fit`` unless injected via ``set_vectorizer``.
        self.vectorizer = None

    def _get_vectors(self, X):
        """Return the vectorized form of ``X[self.column]``.

        Each entry is cleaned with ``prepro.clean_text`` before being
        transformed; the result is materialized with ``.toarray()``.
        """
        cleaned_texts = [prepro.clean_text(text) for text in X[self.column]]
        return self.vectorizer.transform(cleaned_texts).toarray()

    def fit(self, X, y):
        """Fit the vectorizer (if none is set) and the classifier chain.

        An integer ``column`` is replaced on ``self`` by the label found at
        that position in ``X.columns``, so later calls index by label.

        Returns:
            ``self``.
        """
        # bool is a subclass of int; keep it excluded, as the exact type
        # comparison used previously did.
        if isinstance(self.column, int) and not isinstance(self.column, bool):
            self.column = list(X.columns)[self.column]
        if self.vectorizer is None:
            self.vectorizer = prepro.get_text_vectorizer(X, self.column)
        self.classifier.fit(self._get_vectors(X), y)
        return self

    def predict(self, X):
        """Return the chain's predictions for the text column of ``X``."""
        return self.classifier.predict(self._get_vectors(X))

    def predict_proba(self, X: pd.DataFrame):
        """Return the chain's ``predict_proba`` output for ``X``."""
        return self.classifier.predict_proba(self._get_vectors(X))

    def partial_fit(self, X, y):
        """Delegate to the chain's ``partial_fit`` and return its result.

        The vectorizer must already be set; it is not created here.
        """
        return self.classifier.partial_fit(self._get_vectors(X), y)

    def score(self, X, y):
        """Return the chain's ``score`` on the vectorized column of ``X``."""
        return self.classifier.score(self._get_vectors(X), y)

    def set_params(self, **params):
        """Forward ``params`` to the wrapped chain and return ``self``."""
        self.classifier.set_params(**params)
        return self

    def get_params(self, deep=True):
        """Return the wrapped chain's parameters.

        ``deep`` defaults to ``True`` so the method can be called without
        arguments, as the scikit-learn estimator API allows.

        NOTE(review): the returned names belong to the inner chain, not to
        this class's constructor — confirm this is what callers such as
        ``clone`` or grid search expect.
        """
        return self.classifier.get_params(deep)

    def set_vectorizer(self, vectorizer):
        """Use ``vectorizer`` instead of building one during ``fit``."""
        self.vectorizer = vectorizer
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    """Fit a logistic-regression classifier chain and check its predictions.

    Verifies that predictions have the label matrix's shape, that
    thresholding ``predict_proba`` at 0.5 reproduces ``predict``, and that
    the per-estimator coefficient counts start at the number of feature
    columns and grow by one for each subsequent estimator in the chain.
    """
    features, labels = generate_multilabel_dataset_with_correlations()
    chain = ClassifierChain(LogisticRegression())
    chain.fit(features, labels)

    predicted = chain.predict(features)
    assert_equal(predicted.shape, labels.shape)

    probabilities = chain.predict_proba(features)
    thresholded = (probabilities >= .5)
    assert_array_equal(thresholded, predicted)

    n_features = features.shape[1]
    n_labels = labels.shape[1]
    coef_sizes = [estimator.coef_.size for estimator in chain.estimators_]
    assert_equal(coef_sizes, list(range(n_features, n_features + n_labels)))
# Split every record on single spaces and turn the tokens into a dict via
# make_dict, one dict per sample, for both the training and validation sets.
features = [item.split(" ") for item in train_df]
col_dicts = [make_dict(entry) for entry in features]
features_val = [item.split(" ") for item in val_df]
col_dicts_val = [make_dict(entry) for entry in features_val]

# Keys missing from a sample show up as NaN in the frame; encode them as 0.
# NOTE(review): the two frames derive their columns independently, so the
# validation columns are not guaranteed to line up with the training ones —
# confirm make_dict yields a fixed key set.
features_df = pd.DataFrame(col_dicts).fillna(0)
features_df_val = pd.DataFrame(col_dicts_val).fillna(0)
print('done cleanning')

X_train = np.array(features_df)
Y_train = np.array(encoded_labels_df)
x_val = np.array(features_df_val)
y_val = np.array(encoded_labels_df_val)

# Train one classifier chain with a random label order; the seed is kept so
# it can be embedded in the output file name.
base_lr = LogisticRegression(max_iter=MAX_ITER, n_jobs=-1, verbose=1)
int_rand = np.random.randint(1000)
chain = ClassifierChain(base_lr, order='random', random_state=int_rand)
chain.fit(X_train, Y_train)

# format() stringifies both parts, so neither has to be a str already.
filename = "{0}_{1}.pickle".format(MAX_ITER, int_rand)
# Context manager guarantees the handle is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(chain, model_file)
# To reload (trusted files only):
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)

print('start predict')
# NOTE(review): `chains` is not defined in the visible code (only the single
# `chain` above is) — confirm it exists elsewhere or iterate over [chain].
Y_pred_chains = np.array([chain.predict_proba(x_val) for chain in chains])
def naive_base(params):
    """Run an entropy-driven active-learning loop with a classifier chain.

    Every iteration rebuilds the sentence and ground-truth dictionaries for
    all buildings, fits a TF-IDF vectorizer and a
    ``ClassifierChain(RandomForestClassifier())`` on the current learning
    samples, evaluates on the target building's remaining samples, and then
    picks up to ``inc_num`` high-entropy samples (at most one per cluster)
    to add to the learning set in the next iteration.

    Args:
        params: Indexable unpacked positionally as
            ``(building_list, n_list, target_building, inc_num, iter_num)``.
            ``building_list`` and ``n_list`` are iterated in lockstep; ``n``
            is the sample count passed to ``select_random_samples`` for the
            initial learning set of each building. ``target_building`` is the
            building whose unlabeled srcids form the test set. ``inc_num`` is
            the number of srcids queried per iteration and ``iter_num`` the
            number of iterations.

    Returns:
        ``(accuracy_list, macro_f1_list)``, one entry per iteration, each
        metric multiplied by 100.

    NOTE(review): micro-F1 is computed and collected every iteration but is
    not part of the return value — confirm whether callers need it.
    """
    building_list = params[0]
    n_list = params[1]
    target_building = params[2]
    inc_num = params[3]
    iter_num = params[4]
    accuracy_list = []
    micro_f1_list = []
    macro_f1_list = []

    for iter_i in range(iter_num):
        sentence_dict = {}
        truth_dict = {}
        if iter_i == 0:
            # The learning set persists and grows across iterations.
            learning_srcids = []

        for building, n in zip(building_list, n_list):
            if building == target_building:
                # NOTE(review): n is only consumed by the iter_i == 0 branch
                # below, where this increment is zero.
                n += iter_i * inc_num

            if building != 'ghc':
                (_, srcid_list, name_list, jciname_list, desc_list,
                 _, _) = toker.parse_sentences(building)
                for srcid, name, jciname, desc in zip(
                        srcid_list, name_list, jciname_list, desc_list):
                    sentence_dict[srcid] = list(
                        map(replacer, name + jciname + desc))
            else:
                # 'ghc' sentences come from a pre-tokenized JSON file.
                with open(
                        'metadata/{0}_sentence_dict_justseparate.json'.format(
                            building), 'r') as fp:
                    curr_sentence_dict = json.load(fp)
                sentence_dict.update({
                    srcid: list(map(replacer, sentence))
                    for srcid, sentence in curr_sentence_dict.items()
                })

            with open('metadata/{0}_ground_truth.json'.format(building),
                      'r') as fp:
                truth_dict.update(json.load(fp))
            label_dict = get_label_dict(building)
            srcids = list(truth_dict.keys())

            if iter_i == 0:
                learning_srcids += select_random_samples(
                    building, srcids, n, True,
                    token_type='justseparate',
                    reverse=True,
                    cluster_dict=None,
                    shuffle_flag=False)
            else:
                # Queried samples are appended three times (and once more per
                # building in the list) — presumably to up-weight them;
                # TODO confirm.
                learning_srcids += new_srcids * 3

            if building == target_building:
                # Set gives O(1) membership; the list may hold duplicates.
                known_srcids = set(learning_srcids)
                test_srcids = [
                    srcid for srcid in label_dict.keys()
                    if srcid not in known_srcids
                ]

        binarizer = MultiLabelBinarizer().fit(truth_dict.values())
        vectorizer = TfidfVectorizer(tokenizer=tokenizer).fit(
            list(map(joiner, sentence_dict.values())))

        learning_doc = [
            ' '.join(sentence_dict[srcid]) for srcid in learning_srcids
        ]
        learning_vect_doc = vectorizer.transform(learning_doc)
        learning_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in learning_srcids])

        classifier = ClassifierChain(RandomForestClassifier())
        classifier.fit(learning_vect_doc, learning_truth_mat)

        test_doc = [' '.join(sentence_dict[srcid]) for srcid in test_srcids]
        test_vect_doc = vectorizer.transform(test_doc)
        pred_mat = classifier.predict(test_vect_doc)
        prob_mat = classifier.predict_proba(test_vect_doc)

        # Query stage for active learning: rank test samples by entropy,
        # highest first.
        entropies = [get_entropy(prob) for prob in prob_mat]
        sorted_entropies = sorted(zip(test_srcids, entropies),
                                  key=itemgetter(1),
                                  reverse=True)

        added_cids = set()
        new_srcids = []
        cluster_dict = get_cluster_dict(target_building)
        learned_srcids = set(learning_srcids)
        for srcid, _ in sorted_entropies:
            if srcid in learned_srcids:
                continue
            # First cluster containing the srcid, or None when there is none.
            the_cid = next(
                (cid for cid, cluster in cluster_dict.items()
                 if srcid in cluster),
                None)
            # Take at most one sample per cluster in each iteration.
            if the_cid in added_cids:
                continue
            added_cids.add(the_cid)
            new_srcids.append(srcid)
            if len(new_srcids) == inc_num:
                break

        test_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in test_srcids])
        # Anything that is not already an ndarray is converted via
        # .toarray() before scoring.
        if not isinstance(pred_mat, np.ndarray):
            pred_mat = pred_mat.toarray()
        if not isinstance(test_truth_mat, np.ndarray):
            test_truth_mat = test_truth_mat.toarray()

        accuracy = get_accuracy(test_truth_mat, pred_mat)
        micro_f1 = get_micro_f1(test_truth_mat, pred_mat)
        macro_f1 = get_macro_f1(test_truth_mat, pred_mat)
        accuracy_list.append(accuracy * 100)
        micro_f1_list.append(micro_f1 * 100)
        macro_f1_list.append(macro_f1 * 100)

    return accuracy_list, macro_f1_list