def test_edge_map_works_weighted_self_edged_normalized(self):
        test_data = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])

        base_class = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=True, normalize_self_edges=True)
        edge_map = base_class.transform(test_data)

        self.assertEqual(len(edge_map), 3)
        self.assertEqual(edge_map[(0, 1)], 2)
        self.assertEqual(edge_map[(1, 1)], 1.5)
    def test_edge_map_works_unweighted_non_self_edged_non_normalized(self):
        test_data = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])

        base_class = LabelCooccurrenceGraphBuilder(weighted=False, include_self_edges=False, normalize_self_edges=False)
        edge_map = base_class.transform(test_data)

        self.assertEqual(len(edge_map), 1)
        self.assertEqual(edge_map[(0, 1)], 1)
        self.assertNotIn((1, 1), edge_map)
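A quick sanity check of where the asserted numbers come from, using plain numpy instead of the builder (this mirrors the test data above and is not part of the library):

import numpy as np
import scipy.sparse as sp

y = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]]).toarray()

# labels 0 and 1 are both set in the last two rows -> edge (0, 1) has weight 2
print(int(np.sum((y[:, 0] == 1) & (y[:, 1] == 1))))  # 2

# label 1 is set in three rows -> raw self-edge weight 3, halved to 1.5
# when normalize_self_edges=True; the unweighted builder only records 0/1
print(int(np.sum(y[:, 1] == 1)) / 2.0)  # 1.5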
    def classifiers(self):
        graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                                      include_self_edges=False)

        param_dicts = {
            'GraphFactorization': dict(epoch=1),
            'GraRep': dict(Kstep=2),
            'HOPE': dict(),
            'LaplacianEigenmaps': dict(),
            'LINE': dict(epoch=1, order=1),
            'LLE': dict(),
        }

        if not (sys.version_info[0] == 2
                or platform.architecture()[0] == '32bit'):
            for embedding in OpenNetworkEmbedder._EMBEDDINGS:
                if embedding == 'LLE':
                    dimension = 3
                else:
                    dimension = 4

                yield EmbeddingClassifier(
                    OpenNetworkEmbedder(copy(graph_builder), embedding,
                                        dimension, 'add', True,
                                        param_dicts[embedding]),
                    LinearRegression(), MLkNN(k=2))

        yield EmbeddingClassifier(
            SKLearnEmbedder(SpectralEmbedding(n_components=2)),
            LinearRegression(), MLkNN(k=2))

        yield EmbeddingClassifier(
            CLEMS(metrics.accuracy_score, True),
            LinearRegression(), MLkNN(k=2), True)
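Of the classifiers yielded above, the SKLearnEmbedder variant has the fewest dependencies (no OpenNE, no CLEMS metric matrix); a minimal standalone fit/predict sketch on hypothetical toy data, not taken from the test suite:

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.manifold import SpectralEmbedding
from skmultilearn.adapt import MLkNN
from skmultilearn.embedding import EmbeddingClassifier, SKLearnEmbedder

# hypothetical toy problem: 12 samples, 6 features, 3 labels
rng = np.random.RandomState(0)
X_toy = rng.rand(12, 6)
y_toy = rng.randint(0, 2, size=(12, 3))

clf = EmbeddingClassifier(
    SKLearnEmbedder(SpectralEmbedding(n_components=2)),
    LinearRegression(), MLkNN(k=2))
clf.fit(X_toy, y_toy)
print(clf.predict(X_toy).shape)  # (12, 3)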
Example #6
def supported_graphbuilder_generator():
    for weighted in [True, False]:
        for include_self_edges in [True, False]:
            normalize_cases = [False]
            if weighted and include_self_edges:
                normalize_cases.append(True)
            for normalize_self_edges in normalize_cases:
                yield LabelCooccurrenceGraphBuilder(
                    weighted=weighted,
                    include_self_edges=include_self_edges,
                    normalize_self_edges=normalize_self_edges)
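A short sketch of how the generator can be exercised, printing the edge map each supported configuration produces for the toy label matrix used in the tests above:

import scipy.sparse as sp

y = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])

for builder in supported_graphbuilder_generator():
    # transform returns a dict mapping label-index pairs to edge weights
    print(dict(builder.transform(y)))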
Example #7
def run_test2(normas, n_jobs=1):
    models = [[('tfidf', TfidfVectorizer(min_df=20, max_df=0.5))]]

    clfs = [{
        'clf':
        ('mv_mlp', MajorityVotingClassifier(classifier=MLPClassifier())),
        'params': {
            'mv_mlp__classifier__hidden_layer_sizes': [(150,), (100, 100),
                                                       (50, 50, 50)],
            'mv_mlp__classifier__activation': ['tanh', 'relu'],
            'mv_mlp__clusterer': [
                NetworkXLabelGraphClusterer(
                    LabelCooccurrenceGraphBuilder(weighted=True,
                                                  include_self_edges=False),
                    'louvain'),
                NetworkXLabelGraphClusterer(
                    LabelCooccurrenceGraphBuilder(weighted=True,
                                                  include_self_edges=False),
                    'lpa')
            ]
        }
    }]
    run(normas, models, clfs, n_jobs)
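The `run` helper is defined elsewhere; the interesting part of the grid is the label-space clusterer. A minimal sketch of what one of those clusterer candidates does in isolation, on a hypothetical toy label matrix (not part of run_test2):

import numpy as np
from skmultilearn.cluster import (LabelCooccurrenceGraphBuilder,
                                  NetworkXLabelGraphClusterer)

# hypothetical toy label matrix: 6 samples, 4 labels, two obvious label groups
y = np.array([[1, 1, 0, 0],
              [1, 1, 0, 0],
              [0, 0, 1, 1],
              [0, 0, 1, 1],
              [1, 1, 0, 0],
              [0, 0, 1, 1]])

clusterer = NetworkXLabelGraphClusterer(
    LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False),
    'louvain')

# partitions the label set into communities, e.g. [[0, 1], [2, 3]]
print(clusterer.fit_predict(None, y))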
Example #8
import numpy as np
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import normalize
from skmultilearn.cluster import LabelCooccurrenceGraphBuilder


class LabelRank:
    """
    A simple Python implementation of the LabelRank method proposed by
     Bin Fu (2018) in 'Learning label dependency for multi-label classification'
    """
    def __init__(self, a=0.3, tol=0.01):
        self.a = a
        self.tol = tol
        self.T = None
        self.graph_builder = LabelCooccurrenceGraphBuilder(
            weighted=True, include_self_edges=False)

    def fit(self, y):
        if isinstance(y, list): y = np.array(y)
        edge_map = self.graph_builder.transform(y)
        W = np.zeros((y.shape[1], y.shape[1]))
        for target, source in edge_map:
            W[target][source] = edge_map[(target, source)]
            W[source][target] = edge_map[(target, source)]
        S = normalize(W, norm='l1', axis=0)
        self.T = normalize(S, norm='l1', axis=1)

    def transform(self, probas):
        if self.T is None:
            raise NotFittedError(
                'Model is not fitted. Fit LabelRank model first.')
        probas = np.array(probas)
        transformed_probas = []
        for proba in probas:
            diff = 1
            p_x_t = proba
            while diff > self.tol:
                p_x_t_1 = self.a * self.T.dot(p_x_t) + ((1 - self.a) * proba)
                diff = sum(abs((p_x_t_1 - p_x_t) / p_x_t_1 * 100))
                p_x_t = p_x_t_1
            transformed_probas.append(p_x_t)
        return np.array(transformed_probas)
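A minimal usage sketch of the class above: fit the transition matrix on a small binary label matrix, then re-rank per-label probability estimates (the matrices and probability values are illustrative only):

import numpy as np

y = np.array([[1, 1, 0],
              [1, 1, 0],
              [0, 1, 1],
              [1, 0, 1]])

lr = LabelRank(a=0.3, tol=0.01)
lr.fit(y)

# per-label probability estimates for two new samples
probas = np.array([[0.8, 0.1, 0.1],
                   [0.2, 0.7, 0.4]])
print(lr.transform(probas))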
Example #9
scoring_funcs = {
    "hamming loss": hamming_func,
    "aiming": aiming_func,
    "coverage": coverage_func,
    "accuracy": accuracy_func,
    "absolute true": absolute_true_func,
    "absolute false": absolute_false_func
}  # Keep recorded
parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [ExtraTreesClassifier()],
    'classifier__classifier__n_estimators': [100, 500],
    'clusterer': [
        NetworkXLabelGraphClusterer(
            LabelCooccurrenceGraphBuilder(weighted=True,
                                          include_self_edges=False),
            'louvain'),
        NetworkXLabelGraphClusterer(
            LabelCooccurrenceGraphBuilder(weighted=True,
                                          include_self_edges=False), 'lpa')
    ]
}

ext = GridSearchCV(LabelSpacePartitioningClassifier(),
                   param_grid=parameters,
                   n_jobs=-1,
                   cv=loocv,
                   scoring=scoring_funcs,
                   verbose=0,
                   refit="absolute true")
ext.fit(X.T, Y.T)
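`hamming_func`, `aiming_func` and the other entries are defined outside this snippet; GridSearchCV expects them to be scorer objects. A hypothetical reconstruction of how such entries are typically built with sklearn's `make_scorer`, using stock metrics as stand-ins for the custom multi-label definitions:

from sklearn.metrics import accuracy_score, hamming_loss, make_scorer

# stand-ins only: the real aiming/coverage/absolute-true metrics are custom
hamming_func = make_scorer(hamming_loss, greater_is_better=False)
accuracy_func = make_scorer(accuracy_score)
absolute_true_func = make_scorer(accuracy_score)  # subset accuracy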
Example #10

"""
@author: hamishgibbs
"""
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import IGraphLabelGraphClusterer, LabelCooccurrenceGraphBuilder, StochasticBlockModel, GraphToolLabelGraphClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.dataset import load_dataset
X_train, y_train, feature_names, label_names = load_dataset(
    'emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')

#%%
base_classifier = RandomForestClassifier(n_estimators=1000)
#%%
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                              include_self_edges=False)
#%%
model = StochasticBlockModel(nested=False,
                             use_degree_correlation=True,
                             allow_overlap=False,
                             weight_model='real-normal')
#%%
problem_transform_classifier = LabelPowerset(classifier=base_classifier,
                                             require_dense=[False, False])
#%%
clusterer = GraphToolLabelGraphClusterer(graph_builder=graph_builder,
                                         model=model)

#%%
classifier = LabelSpacePartitioningClassifier(problem_transform_classifier,
                                              clusterer)
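The snippet stops after constructing the partitioning classifier; a minimal sketch of the remaining fit/predict step on the emotions split loaded above (it requires graph-tool, as does the clusterer itself):

#%%
from sklearn.metrics import accuracy_score

classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print('subset accuracy:', accuracy_score(y_test, predictions))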
Example #11
def mlknn(train_tasks, test_tasks, train_meta, test_meta, x_spl, y_spl):

    with open('predictions_new_OpenNetworkEmbedder.pickle', 'rb') as f:
        data = pickle.load(f)
        predictions_new = data['predictions']
        all_tags = data['all_tags']
    all_tags = []
    for problem in train_tasks:
        all_tags += problem['tags']
    for problem in test_tasks:
        all_tags += problem['tags']
    all_tags = list(set(all_tags))

    x_train, y_train = transform_data(train_tasks, train_meta, all_tags)
    x_test, y_test = transform_data(test_tasks, test_meta, all_tags)

    vectorizer = TfidfVectorizer()
    # fit the vocabulary on the training texts only, then reuse it for the test set
    vectorizer.fit(x_train)
    x_train_transformed = vectorizer.transform(x_train)
    x_test_transformed = vectorizer.transform(x_test)

    x_train = lil_matrix(x_train_transformed).toarray()
    y_train = lil_matrix(y_train).toarray()
    x_test = lil_matrix(x_test_transformed).toarray()
    y_test = lil_matrix(y_test).toarray()

    from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
    from sklearn.manifold import SpectralEmbedding
    from sklearn.ensemble import RandomForestRegressor
    from skmultilearn.adapt import MLkNN
    for n in (10000000, ):
        clf = EmbeddingClassifier(
            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
            RandomForestRegressor(n_estimators=10), MLkNN(k=5))

        clf.fit(x_train[:n], y_train[:n])

        predictions = clf.predict(x_test)

        from sklearn.metrics import accuracy_score
        print("Accuracy = ", accuracy_score(y_test, predictions))

    graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                                  include_self_edges=False)
    openne_line_params = dict(batch_size=1000, order=3)
    embedder = OpenNetworkEmbedder(graph_builder,
                                   'LINE',
                                   dimension=5 * y_train.shape[1],
                                   aggregation_function='add',
                                   normalize_weights=True,
                                   param_dict=openne_line_params)

    clf = EmbeddingClassifier(embedder, RandomForestRegressor(n_estimators=10),
                              MLkNN(k=5))

    clf.fit(x_train, y_train)
    with open('model.pickle', 'wb') as f:
        pickle.dump({'all_tags': all_tags, 'model': clf}, f)

    predictions_new = clf.predict(x_test)

    # with open('predictions_new_OpenNetworkEmbedder.pickle', 'wb') as f:
    #     pickle.dump({'all_tags': all_tags, 'predictions': predictions_new}, f)
    # with open('predictions_new_OpenNetworkEmbedder.pickle', 'rb') as f:
    #     data = pickle.load(f)
    #     predictions_new = data['predictions']
    #     all_tags_pickle = data['all_tags']
    #     # permutate y to be consistent with
    #     y_test_new = y_test
    # return predictions_new, all

    for predicted, real in zip(predictions_new.toarray(), y_test):
        pt = []
        rt = []
        # for ap, aap in zip(predicted, all_tags_pickle):
        #     if ap:
        #         pt.append(aap)
        # for ar, aa in zip(real, all_tags):
        #     if ar:
        #         rt.append(aa)

        for ap, ar, aa in zip(predicted, real, all_tags):
            if ap:
                pt.append(aa)
            if ar:
                rt.append(aa)
        print('predicted', pt)
        print('real', rt)
        print('-' * 20, '\n')

    from sklearn.metrics import accuracy_score
    print("Accuracy = ", accuracy_score(y_test, predictions_new))
Example #12
print(mlknn.best_score_)

parameters = {'c_k': [2**i for i in range(-5, 5)]}

mtsvm = GridSearchCV(MLTSVM(), param_grid=parameters, n_jobs=-1, cv=loocv,
                     scoring=scoring_funcs, verbose=3, refit="absolute true")

mtsvm.fit(X, Y.values)
print(mtsvm.best_score_)

parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [ExtraTreesClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
    'clusterer' : [
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'),
        NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa')
    ]
}


ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters,
                   n_jobs=-1, cv=loocv, scoring=scoring_funcs, verbose=3,
                   refit="absolute true")
ext.fit(X, Y.values)
print(ext.best_score_)


parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [RandomForestClassifier()],
    'classifier__classifier__n_estimators': [50, 100, 500, 1000],
Example #13
    def __init__(self, a=0.3, tol=0.01):
        self.a = a
        self.tol = tol
        self.T = None
        self.graph_builder = LabelCooccurrenceGraphBuilder(
            weighted=True, include_self_edges=False)