def test_edge_map_works_weighted_self_edged_normalized(self):
    """Weighted graph with self edges: self-loop weights are halved by normalization."""
    label_matrix = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])
    builder = LabelCooccurrenceGraphBuilder(
        weighted=True, include_self_edges=True, normalize_self_edges=True)
    edges = builder.transform(label_matrix)
    # three edges expected: (0, 0), (0, 1) and (1, 1)
    self.assertEqual(len(edges), 3)
    # labels 0 and 1 co-occur in two rows
    self.assertEqual(edges[(0, 1)], 2)
    # label 1 occurs in three rows; normalized self edge = 3 / 2
    self.assertEqual(edges[(1, 1)], 1.5)
def test_edge_map_works_unweighted_non_self_edged_non_normalized(self):
    """Unweighted graph without self edges: a single binary edge, no self loops."""
    label_matrix = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])
    builder = LabelCooccurrenceGraphBuilder(
        weighted=False, include_self_edges=False, normalize_self_edges=False)
    edges = builder.transform(label_matrix)
    # only the (0, 1) co-occurrence edge should be present
    self.assertEqual(len(edges), 1)
    # unweighted: presence is recorded as 1 regardless of count
    self.assertEqual(edges[(0, 1)], 1)
    self.assertNotIn((1, 1), edges)
def test_edge_map_works_weighted_self_edged_normalized(self):
    """Check weighted, self-edged, normalized co-occurrence extraction."""
    y = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])
    graph_builder = LabelCooccurrenceGraphBuilder(weighted=True,
                                                  include_self_edges=True,
                                                  normalize_self_edges=True)
    result = graph_builder.transform(y)
    self.assertEqual(len(result), 3)          # (0, 0), (0, 1), (1, 1)
    self.assertEqual(result[(0, 1)], 2)       # two rows share labels 0 and 1
    self.assertEqual(result[(1, 1)], 1.5)     # 3 occurrences of label 1, halved
def test_edge_map_works_unweighted_non_self_edged_non_normalized(self):
    """Check unweighted, non-self-edged, non-normalized co-occurrence extraction."""
    y = sp.lil_matrix([[0, 1], [1, 0], [1, 1], [1, 1]])
    graph_builder = LabelCooccurrenceGraphBuilder(weighted=False,
                                                  include_self_edges=False,
                                                  normalize_self_edges=False)
    result = graph_builder.transform(y)
    self.assertEqual(len(result), 1)       # only the cross-label edge
    self.assertEqual(result[(0, 1)], 1)    # binary presence, not a count
    self.assertNotIn((1, 1), result)       # self edges excluded
def classifiers(self):
    """Yield every embedding-based classifier configuration under test.

    The OpenNE-backed embedders are skipped on Python 2 and on 32-bit
    platforms, where that dependency is unavailable.

    Yields:
        EmbeddingClassifier: one classifier per embedding strategy.
    """
    graph_builder = LabelCooccurrenceGraphBuilder(
        weighted=True, include_self_edges=False)
    param_dicts = {
        'GraphFactorization': dict(epoch=1),
        'GraRep': dict(Kstep=2),
        'HOPE': dict(),
        'LaplacianEigenmaps': dict(),
        'LINE': dict(epoch=1, order=1),
        'LLE': dict(),
    }
    if not (sys.version_info[0] == 2
            or platform.architecture()[0] == '32bit'):
        for embedding in OpenNetworkEmbedder._EMBEDDINGS:
            # LLE requires a smaller embedding dimension than the others
            dimension = 3 if embedding == 'LLE' else 4
            yield EmbeddingClassifier(
                OpenNetworkEmbedder(copy(graph_builder), embedding,
                                    dimension, 'add', True,
                                    param_dicts[embedding]),
                LinearRegression(), MLkNN(k=2))
    yield EmbeddingClassifier(
        SKLearnEmbedder(SpectralEmbedding(n_components=2)),
        LinearRegression(), MLkNN(k=2))
    # FIX: the CLEMS classifier was previously constructed and discarded
    # (no `yield`), so it was never exercised; yield it like the others.
    yield EmbeddingClassifier(
        CLEMS(metrics.accuracy_score, True),
        LinearRegression(), MLkNN(k=2), True)
def supported_graphbuilder_generator():
    """Yield every valid LabelCooccurrenceGraphBuilder configuration.

    normalize_self_edges=True is only meaningful when the graph is both
    weighted and includes self edges, so True is emitted only then.
    """
    for weighted in (True, False):
        for include_self_edges in (True, False):
            # normalization variants: always False, plus True when allowed
            if weighted and include_self_edges:
                normalize_options = (False, True)
            else:
                normalize_options = (False,)
            for normalize in normalize_options:
                yield LabelCooccurrenceGraphBuilder(
                    weighted=weighted,
                    include_self_edges=include_self_edges,
                    normalize_self_edges=normalize)
def run_test2(normas, n_jobs=1):
    """Run the TF-IDF + majority-voting-MLP experiment over two clusterers."""
    vectorizer_steps = [[('tfidf', TfidfVectorizer(min_df=20, max_df=0.5))]]
    # two label-space clusterings of the same co-occurrence graph
    louvain_clusterer = NetworkXLabelGraphClusterer(
        LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False),
        'louvain')
    lpa_clusterer = NetworkXLabelGraphClusterer(
        LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False),
        'lpa')
    classifier_grid = [{
        'clf': ('mv_mlp', MajorityVotingClassifier(classifier=MLPClassifier())),
        'params': {
            'mv_mlp__classifier__hidden_layer_sizes': [(150), (100, 100), (50, 50, 50)],
            'mv_mlp__classifier__activation': ['tanh', 'relu'],
            'mv_mlp__clusterer': [louvain_clusterer, lpa_clusterer],
        },
    }]
    run(normas, vectorizer_steps, classifier_grid, n_jobs)
class LabelRank:
    """
    A simple Python implementation of the LabelRank method proposed by
    Bin Fu (2018) in 'Learning label dependency for multi-label
    classification'
    """

    def __init__(self, a=0.3, tol=0.01):
        # a: damping factor mixing propagated scores with the original ones
        self.a = a
        # tol: convergence threshold on the summed percentage change
        self.tol = tol
        # T: transition matrix, populated by fit()
        self.T = None
        self.graph_builder = LabelCooccurrenceGraphBuilder(
            weighted=True, include_self_edges=False)

    def fit(self, y):
        """Build the label transition matrix from co-occurrences in y."""
        if isinstance(y, list):
            y = np.array(y)
        edge_map = self.graph_builder.transform(y)
        n_labels = y.shape[1]
        W = np.zeros((n_labels, n_labels))
        # the builder reports each pair once; mirror the entry so W is symmetric
        for (u, v), weight in edge_map.items():
            W[u][v] = weight
            W[v][u] = weight
        # column-wise then row-wise L1 normalization
        S = normalize(W, norm='l1', axis=0)
        self.T = normalize(S, norm='l1', axis=1)

    def transform(self, probas):
        """Propagate each probability vector until the change drops below tol."""
        if self.T is None:
            raise NotFittedError(
                'Model is not fitted. Fit LabelRank model first.')
        probas = np.array(probas)
        results = []
        for proba in probas:
            current = proba
            diff = 1
            while diff > self.tol:
                updated = self.a * self.T.dot(current) + ((1 - self.a) * proba)
                # summed percentage change relative to the new iterate
                diff = sum(abs((updated - current) / updated * 100))
                current = updated
            results.append(current)
        return np.array(results)
# Multi-label scorers applied during the grid search below.
scoring_funcs = {
    "hamming loss": hamming_func,
    "aiming": aiming_func,
    "coverage": coverage_func,
    "accuracy": accuracy_func,
    "absolute true": absolute_true_func,
    "absolute false": absolute_false_func
}

# Keep recorded
# Two clusterings of the same weighted co-occurrence graph.
_louvain_clusterer = NetworkXLabelGraphClusterer(
    LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False),
    'louvain')
_lpa_clusterer = NetworkXLabelGraphClusterer(
    LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False),
    'lpa')
parameters = {
    'classifier': [LabelPowerset()],
    'classifier__classifier': [ExtraTreesClassifier()],
    'classifier__classifier__n_estimators': [100, 500],
    'clusterer': [_louvain_clusterer, _lpa_clusterer],
}

# Exhaustive grid search, refit on the best "absolute true" score.
ext = GridSearchCV(LabelSpacePartitioningClassifier(),
                   param_grid=parameters,
                   n_jobs=-1,
                   cv=loocv,
                   scoring=scoring_funcs,
                   verbose=0,
                   refit="absolute true")
ext.fit(X.T, Y.T)
@author: hamishgibbs
"""
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.cluster import IGraphLabelGraphClusterer, LabelCooccurrenceGraphBuilder, StochasticBlockModel, GraphToolLabelGraphClusterer
from skmultilearn.ensemble import LabelSpacePartitioningClassifier
from skmultilearn.dataset import load_dataset

# Fetch the train/test splits of the 'emotions' benchmark dataset.
# NOTE(review): load_dataset downloads data on first use — network access
# is presumably required here; confirm before running offline.
X_train, y_train, feature_names, label_names = load_dataset(
    'emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')
#%%
base_classifier = RandomForestClassifier(n_estimators=1000)
#%%
# Weighted label co-occurrence graph without self loops.
graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
#%%
# Stochastic block model used by graph-tool to partition the label graph.
model = StochasticBlockModel(nested=False, use_degree_correlation=True, allow_overlap=False, weight_model='real-normal')
#%%
problem_transform_classifier = LabelPowerset(classifier=base_classifier, require_dense=[False, False])
#%%
clusterer = GraphToolLabelGraphClusterer(graph_builder=graph_builder, model=model)
#%%
# Final classifier: label space is partitioned by the clusterer, each
# partition solved with the LabelPowerset transform.
classifier = LabelSpacePartitioningClassifier(problem_transform_classifier, clusterer)
def mlknn(train_tasks, test_tasks, train_meta, test_meta, x_spl, y_spl):
    """Train and compare two embedding classifiers on tag-prediction data.

    Fits a SpectralEmbedding-based classifier and an OpenNE LINE-based
    classifier on TF-IDF features of the tasks, pickles the LINE model,
    prints per-sample predicted vs. real tags, and prints accuracy.
    NOTE(review): x_spl and y_spl are unused in the visible body.
    """
    # Load previously stored predictions; both values are overwritten
    # below before being read, so this appears to be leftover scaffolding.
    with open('predictions_new_OpenNetworkEmbedder.pickle', 'rb') as f:
        data = pickle.load(f)
        predictions_new = data['predictions']
        all_tags = data['all_tags']
    # Rebuild the tag vocabulary from both splits (shadows pickled value).
    all_tags = []
    for problem in train_tasks:
        all_tags += problem['tags']
    for problem in test_tasks:
        all_tags += problem['tags']
    all_tags = list(set(all_tags))
    x_train, y_train = transform_data(train_tasks, train_meta, all_tags)
    x_test, y_test = transform_data(test_tasks, test_meta, all_tags)
    vectorizer = TfidfVectorizer()
    vectorizer.fit(x_train)
    # NOTE(review): this second fit() REPLACES the vocabulary learned from
    # x_train with one learned from x_test only — likely a bug; a single
    # fit on the combined corpus (or train only) was probably intended.
    vectorizer.fit(x_test)
    x_train_transformed = vectorizer.transform(x_train)
    x_test_transformed = vectorizer.transform(x_test)
    # Densify everything for the downstream estimators.
    x_train = lil_matrix(x_train_transformed).toarray()
    y_train = lil_matrix(y_train).toarray()
    x_test = lil_matrix(x_test_transformed).toarray()
    y_test = lil_matrix(y_test).toarray()
    from skmultilearn.embedding import SKLearnEmbedder, EmbeddingClassifier
    from sklearn.manifold import SpectralEmbedding
    from sklearn.ensemble import RandomForestRegressor
    from skmultilearn.adapt import MLkNN
    # Single-element loop; n effectively means "use all samples".
    for n in (10000000, ):
        clf = EmbeddingClassifier(
            SKLearnEmbedder(SpectralEmbedding(n_components=10)),
            RandomForestRegressor(n_estimators=10),
            MLkNN(k=5))
        clf.fit(x_train[:n], y_train[:n])
        predictions = clf.predict(x_test)
        from sklearn.metrics import accuracy_score
        print("Accuracy = ", accuracy_score(y_test, predictions))
    # Second model: OpenNE LINE embedding over the label co-occurrence graph.
    graph_builder = LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False)
    openne_line_params = dict(batch_size=1000, order=3)
    embedder = OpenNetworkEmbedder(graph_builder, 'LINE', dimension=5 * y_train.shape[1], aggregation_function='add', normalize_weights=True, param_dict=openne_line_params)
    clf = EmbeddingClassifier(embedder, RandomForestRegressor(n_estimators=10), MLkNN(k=5))
    clf.fit(x_train, y_train)
    # Persist the trained LINE model together with its tag vocabulary.
    with open('model.pickle', 'wb') as f:
        pickle.dump({'all_tags': all_tags, 'model': clf}, f)
    predictions_new = clf.predict(x_test)
    # with open('predictions_new_OpenNetworkEmbedder.pickle', 'wb') as f:
    #     pickle.dump({'all_tags': all_tags, 'predictions': predictions_new}, f)
    # with open('predictions_new_OpenNetworkEmbedder.pickle', 'rb') as f:
    #     data = pickle.load(f)
    #     predictions_new = data['predictions']
    #     all_tags_pickle = data['all_tags']
    # # permutate y to be consistent with
    # y_test_new = y_test
    # return predictions_new, all
    # Print human-readable predicted vs. real tag lists per sample.
    for predicted, real in zip(predictions_new.toarray(), y_test):
        pt = []
        rt = []
        # for ap, aap in zip(predicted, all_tags_pickle):
        #     if ap:
        #         pt.append(aap)
        # for ar, aa in zip(real, all_tags):
        #     if ar:
        #         rt.append(aa)
        for ap, ar, aa in zip(predicted, real, all_tags):
            if ap:
                pt.append(aa)
            if ar:
                rt.append(aa)
        print('predicted', pt)
        print('real', rt)
        print('-' * 20, '\n')
    from sklearn.metrics import accuracy_score
    # accuracy_score is symmetric, so both prints report the same value.
    print("Accuracy = ", accuracy_score(y_test, predictions_new))
    print("Accuracy = ", accuracy_score(predictions_new, y_test))
print(mlknn.best_score_) parameters = {'c_k': [2**i for i in range(-5, 5)]} mtsvm = GridSearchCV(MLTSVM(), param_grid=parameters, n_jobs=-1, cv=loocv, scoring=scoring_funcs, verbose=3, refit="absolute true") mtsvm.fit(X, Y.values) print(mtsvm.best_score_) parameters = { 'classifier': [LabelPowerset()], 'classifier__classifier': [ExtraTreesClassifier()], 'classifier__classifier__n_estimators': [50, 100, 500, 1000], 'clusterer' : [ NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'louvain'), NetworkXLabelGraphClusterer(LabelCooccurrenceGraphBuilder(weighted=True, include_self_edges=False), 'lpa') ] } ext = GridSearchCV(LabelSpacePartitioningClassifier(), param_grid=parameters, n_jobs=-1, cv=loocv, scoring=scoring_funcs, verbose=3, refit="absolute true") ext.fit(X, Y.values) print(ext.best_score_) parameters = { 'classifier': [LabelPowerset()], 'classifier__classifier': [RandomForestClassifier()], 'classifier__classifier__n_estimators': [50, 100, 500, 1000],
def __init__(self, a=0.3, tol=0.01):
    """Store LabelRank hyper-parameters and prepare the graph builder.

    :param a: damping factor mixing propagated and original scores
    :param tol: convergence threshold for the propagation loop
    """
    self.T = None  # transition matrix, built later by fit()
    self.a = a
    self.tol = tol
    # weighted label co-occurrence graph without self loops
    self.graph_builder = LabelCooccurrenceGraphBuilder(
        weighted=True, include_self_edges=False)