Example #1
import pandas as pd
from sklearn.base import BaseEstimator
from sklearn.multioutput import ClassifierChain

import prepro  # project-local text preprocessing helpers


class Multi_classes_classifier_on_column(BaseEstimator):
    def __init__(self, base_classifier, column):
        self.column = column
        self.classifier = ClassifierChain(base_classifier)
        self.vectorizer = None

    def _get_vectors(self, X):
        text_data = X[self.column]
        text_data = [prepro.clean_text(text)
                     for text in text_data]  # text cleaning
        feature_vector = self.vectorizer.transform(text_data).toarray()
        return feature_vector

    def fit(self, X, y):
        if isinstance(self.column, int):
            # Resolve a positional index to the actual column name.
            self.column = X.columns[self.column]
        if self.vectorizer is None:
            self.vectorizer = prepro.get_text_vectorizer(X, self.column)

        feature_vector = self._get_vectors(X)
        self.classifier.fit(feature_vector, y)
        return self

    def predict(self, X):
        feature_vector = self._get_vectors(X)
        result = self.classifier.predict(feature_vector)
        return result

    def predict_proba(self, X: pd.DataFrame):
        feature_vector = self._get_vectors(X)
        result = self.classifier.predict_proba(feature_vector)
        return result

    def partial_fit(self, X, y):
        # Note: sklearn's ClassifierChain does not implement partial_fit; this
        # delegation assumes the wrapped chain implementation provides one.
        feature_vector = self._get_vectors(X)
        result = self.classifier.partial_fit(feature_vector, y)
        return result

    def score(self, X, y):
        feature_vector = self._get_vectors(X)
        result = self.classifier.score(feature_vector, y)
        return result

    def set_params(self, **params):
        self.classifier.set_params(**params)
        return self

    def get_params(self, deep=True):
        result = self.classifier.get_params(deep)
        return result

    def set_vectorizer(self, vectorizer):
        self.vectorizer = vectorizer
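A hedged usage sketch of the wrapper above, assuming the project-local prepro module is importable; the column name, DataFrames, and labels are made up:

from sklearn.linear_model import LogisticRegression

# Hypothetical data: train_df/val_df hold a 'text' column; train_labels is a
# binary indicator matrix with one row of tags per document.
clf = Multi_classes_classifier_on_column(LogisticRegression(max_iter=1000),
                                         column='text')
clf.fit(train_df, train_labels)
probs = clf.predict_proba(val_df)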
Example #2
# From scikit-learn's test suite; assert_equal, assert_array_equal and the
# dataset helper come from the surrounding test module.
def test_classifier_chain_fit_and_predict_with_logistic_regression():
    # Fit classifier chain and verify predict performance
    X, Y = generate_multilabel_dataset_with_correlations()
    classifier_chain = ClassifierChain(LogisticRegression())
    classifier_chain.fit(X, Y)

    Y_pred = classifier_chain.predict(X)
    assert_equal(Y_pred.shape, Y.shape)

    Y_prob = classifier_chain.predict_proba(X)
    Y_binary = (Y_prob >= .5)
    assert_array_equal(Y_binary, Y_pred)

    assert_equal([c.coef_.size for c in classifier_chain.estimators_],
                 list(range(X.shape[1], X.shape[1] + Y.shape[1])))
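The final assertion holds because chain member i is trained on the original features plus the binary predictions of the i previous members, so coefficient counts grow by exactly one per chain position. The dataset helper is defined in scikit-learn's test module; a plausible sketch (assumed, not the verbatim helper) builds correlated binary labels by bit-encoding a multiclass target:

import numpy as np
from sklearn.datasets import make_classification

def generate_multilabel_dataset_with_correlations():
    # Hypothetical reimplementation: 4 correlated label columns from the
    # binary encoding of a 16-class target.
    X, y = make_classification(n_samples=1000, n_features=100, n_classes=16,
                               n_informative=10, random_state=0)
    Y = np.array([[int(bit) for bit in format(label, '04b')] for label in y])
    return X, Y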
Example #3
import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain

# train_df, val_df, encoded_labels_df, encoded_labels_df_val, make_dict and
# MAX_ITER are defined earlier in the original script.

features = [item.split(" ") for item in train_df]
col_dicts = [make_dict(entry) for entry in features]

features_val = [item.split(" ") for item in val_df]
col_dicts_val = [make_dict(entry) for entry in features_val]

features_df = pd.DataFrame(col_dicts)
features_df_val = pd.DataFrame(col_dicts_val)

features_df = features_df.fillna(0)
features_df_val = features_df_val.fillna(0)
print('done cleaning')
X_train = np.array(features_df)
Y_train = np.array(encoded_labels_df)
x_val = np.array(features_df_val)
y_val = np.array(encoded_labels_df_val)

base_lr = LogisticRegression(max_iter=MAX_ITER, n_jobs=-1, verbose=1)

int_rand = np.random.randint(1000)
chain = ClassifierChain(base_lr, order='random', random_state=int_rand)

chain.fit(X_train, Y_train)
filename = f"{MAX_ITER}_{int_rand}.pickle"
with open(filename, 'wb') as f:
    pickle.dump(chain, f)

#loaded_model = pickle.load(open(filename, 'rb'))
print('start predict')
# Only a single chain is fitted above (the original comprehension referenced
# an undefined `chains` collection).
Y_pred_chains = chain.predict_proba(x_val)
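The plural name Y_pred_chains suggests the snippet was adapted from scikit-learn's ensemble-of-chains example, where several randomly ordered chains are fitted and their probabilities averaged; a minimal sketch of that pattern (ensemble size and names assumed):

chains = [ClassifierChain(base_lr, order='random', random_state=i)
          for i in range(10)]
for c in chains:
    c.fit(X_train, Y_train)
Y_pred_chains = np.array([c.predict_proba(x_val) for c in chains])
Y_pred_ensemble = Y_pred_chains.mean(axis=0)  # average over the ensemble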
Example #4
import json
from operator import itemgetter

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import ClassifierChain
from sklearn.preprocessing import MultiLabelBinarizer

# toker, replacer, joiner, tokenizer, get_label_dict, select_random_samples,
# get_cluster_dict, get_entropy, get_accuracy, get_micro_f1 and get_macro_f1
# are project-local helpers.


def naive_base(params):
    (building_list, n_list, target_building,
     inc_num, iter_num) = params
    accuracy_list = []
    micro_f1_list = []
    macro_f1_list = []
    for iter_i in range(iter_num):
        sentence_dict = dict()
        truth_dict = dict()
        if iter_i == 0:
            learning_srcids = list()
        for building, n in zip(building_list, n_list):
            if building == target_building:
                n += iter_i * inc_num
            if building != 'ghc':
                (sensorDF, srcid_list, name_list, jciname_list, desc_list,
                 unit_list, bacnettype_list) = toker.parse_sentences(building)
                for srcid, name, jciname, desc in \
                        zip(srcid_list, name_list, jciname_list, desc_list):
                    sentence_dict[srcid] = list(
                        map(replacer, name + jciname + desc))
            else:
                with open(
                        'metadata/{0}_sentence_dict_justseparate.json'.format(
                            building), 'r') as fp:
                    curr_sentence_dict = json.load(fp)

                curr_sentence_dict = dict([
                    (srcid, list(map(replacer, sentence)))
                    for srcid, sentence in curr_sentence_dict.items()
                ])
                sentence_dict.update(curr_sentence_dict)

            with open('metadata/{0}_ground_truth.json'.format(building),
                      'r') as fp:
                truth_dict.update(json.load(fp))
            label_dict = get_label_dict(building)
            srcids = list(truth_dict.keys())

            if iter_i == 0:
                learning_srcids += select_random_samples(
                    building,
                    srcids,
                    n,
                    True,
                    token_type='justseparate',
                    reverse=True,
                    cluster_dict=None,
                    shuffle_flag=False)
            else:
                # Oversample the srcids queried in the previous iteration.
                learning_srcids += new_srcids * 3
            if building == target_building:
                test_srcids = [
                    srcid for srcid in label_dict.keys()
                    if srcid not in learning_srcids
                ]

        binarizer = MultiLabelBinarizer().fit(truth_dict.values())
        vectorizer = TfidfVectorizer(tokenizer=tokenizer).fit(
            list(map(joiner, sentence_dict.values())))
        learning_doc = [
            ' '.join(sentence_dict[srcid]) for srcid in learning_srcids
        ]
        learning_vect_doc = vectorizer.transform(learning_doc)

        learning_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in learning_srcids])

        #classifier = RandomForestClassifier(n_estimators=200, n_jobs=1)
        classifier = ClassifierChain(RandomForestClassifier())
        classifier.fit(learning_vect_doc, learning_truth_mat)

        test_doc = [' '.join(sentence_dict[srcid]) for srcid in test_srcids]
        test_vect_doc = vectorizer.transform(test_doc)

        pred_mat = classifier.predict(test_vect_doc)
        prob_mat = classifier.predict_proba(test_vect_doc)

        # Query Stage for Active Learning
        entropies = [get_entropy(prob) for prob in prob_mat]
        sorted_entropies = sorted([(test_srcids[i], entropy)
                                   for i, entropy in enumerate(entropies)],
                                  key=itemgetter(1),
                                  reverse=True)
        added_cids = set()
        """
        for srcid in learning_srcids:
            cid = find_keys(srcid, cluster_dict, crit=lambda x,y:x in y)[0]
            added_cids.add(cid)
            """

        new_srcids = []
        new_srcid_cnt = 0
        cluster_dict = get_cluster_dict(target_building)
        for srcid, entropy in sorted_entropies:
            if srcid not in learning_srcids:
                the_cid = None
                for cid, cluster in cluster_dict.items():
                    if srcid in cluster:
                        the_cid = cid
                        break
                if the_cid in added_cids:
                    continue
                added_cids.add(the_cid)
                new_srcids.append(srcid)
                new_srcid_cnt += 1
                if new_srcid_cnt == inc_num:
                    break

        pred_tagsets_list = binarizer.inverse_transform(pred_mat)
        pred_tagsets_dict = dict([
            (srcid, pred_tagset)
            for srcid, pred_tagset in zip(test_srcids, pred_tagsets_list)
        ])

        correct_cnt = 0
        incorrect_cnt = 0
        for srcid in test_srcids:
            pred = pred_tagsets_dict[srcid]
            true = truth_dict[srcid]
            if set(pred) != set(true):
                incorrect_cnt += 1
            else:
                correct_cnt += 1

        test_truth_mat = binarizer.transform(
            [truth_dict[srcid] for srcid in test_srcids])

        if not isinstance(pred_mat, np.ndarray):
            pred_mat = pred_mat.toarray()
        if not isinstance(test_truth_mat, np.ndarray):
            test_truth_mat = test_truth_mat.toarray()

        accuracy = get_accuracy(test_truth_mat, pred_mat)
        micro_f1 = get_micro_f1(test_truth_mat, pred_mat)
        #_, _, macro_f1, _ = precision_recall_fscore_support(test_truth_mat,
        #                                            pred_mat, average='macro')
        macro_f1 = get_macro_f1(test_truth_mat, pred_mat)
        accuracy_list.append(accuracy * 100)
        micro_f1_list.append(micro_f1 * 100)
        macro_f1_list.append(macro_f1 * 100)

    # NOTE: micro_f1_list is computed above but not included in the return.
    return accuracy_list, macro_f1_list
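The query stage above ranks test samples by predictive entropy; get_entropy is a project-local helper, but a minimal sketch (assumed implementation) over one row of the chain's predict_proba output could look like:

import numpy as np

def get_entropy(prob):
    # Sum of per-label Bernoulli entropies for one row of per-label
    # probabilities: higher means the chain is less certain about this
    # sample, so it is queried earlier in active learning.
    p = np.clip(np.asarray(prob, dtype=float), 1e-12, 1 - 1e-12)
    return float(-np.sum(p * np.log(p) + (1 - p) * np.log(1 - p)))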