def main():
    """Show the OWLatentFactors widget on a randomly generated fusion graph.

    The toy data follows the example from
    https://github.com/marinkaz/scikit-fusion
    """
    import numpy as np

    # Random relation matrices (rows x columns follow the object types below).
    users_movies = np.random.rand(50, 100)
    actors_movies = np.random.rand(150, 100)
    actors_actors = np.random.rand(150, 150)

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)

    # The users-movies matrix is deliberately reused for two named relations.
    edges = [
        fusion.Relation(users_movies, users, movies, name='like'),
        fusion.Relation(users_movies, users, movies, name='don\'t like'),
        fusion.Relation(actors_actors, actors, actors, name='married to'),
        fusion.Relation(actors_movies, actors, movies, name='play in'),
    ]

    graph = fusion.FusionGraph()
    for edge in edges:
        graph.add_relation(edge)

    model = fusion.Dfmf()
    model.fuse(graph)

    application = QtGui.QApplication([])
    widget = OWLatentFactors()
    widget.on_fuser_change(FittedFusionGraph(model))
    widget.show()
    application.exec()
def main():
    """Show the OWChaining widget on a randomly generated fusion graph.

    The toy data follows the example from
    https://github.com/marinkaz/scikit-fusion
    """
    import numpy as np
    from AnyQt.QtWidgets import QApplication

    # Random relation matrices; each shape is (row type size, column type size).
    users_movies = np.random.rand(50, 100)
    movies_actors = np.random.rand(100, 150)
    actors_actors = np.random.rand(150, 150)
    users_actors = np.random.rand(50, 150)

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)

    # The users-movies matrix is deliberately reused for two named relations.
    edges = [
        fusion.Relation(users_movies, users, movies, name='like'),
        fusion.Relation(users_actors, users, actors, name='are fans of'),
        fusion.Relation(users_movies, users, movies, name='don\'t like'),
        fusion.Relation(actors_actors, actors, actors, name='married to'),
        fusion.Relation(movies_actors, movies, actors, name='feature'),
    ]

    graph = fusion.FusionGraph()
    for edge in edges:
        graph.add_relation(edge)

    model = fusion.Dfmf()
    model.fuse(graph)

    application = QApplication([])
    widget = OWChaining()
    widget.on_fuser_change(FittedFusionGraph(model))
    widget.show()
    application.exec()
def factorization(self):
    """
    Fuse user ratings, image features and user data with Dfmf.

    Stores the completed ratings matrix in self.predictions, the mask
    returned by split_train_test in self.mask and the input ratings in
    self.true_values.
    """
    print('\nDfmf')

    image_features = self.selected_features
    split_mask = self.split_train_test(self.users_ratings, 0.2)
    ratings = self.users_ratings
    user_data = self.users

    type_1 = fusion.ObjectType('Type 1', 10)
    type_2 = fusion.ObjectType('Type 2', 10)
    type_3 = fusion.ObjectType('Type 3', 10)
    type_user = fusion.ObjectType('UserData', 10)

    graph = fusion.FusionGraph()
    graph.add_relations_from([
        fusion.Relation(ratings, type_1, type_2, name='User ratings'),
        fusion.Relation(image_features, type_2, type_3, name='Images'),
        fusion.Relation(user_data, type_1, type_user, name='Users'),
    ])

    model = fusion.Dfmf(init_type="random_vcol")
    # The mask is attached to the ratings relation before fitting.
    graph['User ratings'].mask = split_mask
    fitted = model.fuse(graph)

    self.predictions = fitted.complete(graph['User ratings'])
    self.mask = split_mask
    self.true_values = ratings
def mf(train_idx, test_idx, term_idx):
    """Predict one GO term for the test genes from fused latent features.

    The gene/GO-term annotations of the test rows are zeroed before the
    factorization, the gene/experimental-condition relation is completed
    by the fitted model, and a random forest trained on the completed
    training rows predicts the term for the test rows.

    :param train_idx: row indices used to train the classifier
    :param test_idx: row indices whose annotations are hidden and predicted
    :param term_idx: column index of the target term in the annotation matrix
    :returns: predicted probability of class index 1 for each test row
    """
    annotations = dicty[gene][go_term][0].data
    expression = dicty[gene][exp_cond][0].data

    ann = annotations.copy()
    ann[test_idx, :] = 0  # hide the test rows from the factorization

    relations = [
        skf.Relation(ann, gene, go_term),
        skf.Relation(expression, gene, exp_cond),
        skf.Relation(dicty[gene][gene][0].data, gene, gene),
    ]
    fusion_graph = skf.FusionGraph(relations)
    fuser = skf.Dfmf(max_iter=10, n_run=1, init_type="random_vcol",
                     random_state=0)

    # Ranks are set to a fixed fraction of each dimension. They are used as
    # array dimensions, so they must be integers: a float rank is rejected
    # by current NumPy when factor matrices are allocated.
    p = 0.7
    gene.rank = int(p * annotations.shape[0])
    exp_cond.rank = int(p * expression.shape[1])
    go_term.rank = int(p * annotations.shape[1])

    fuser.fuse(fusion_graph)
    X = fuser.complete(fusion_graph[gene][exp_cond][0])

    X_train = X[train_idx, :]
    y_train = annotations[train_idx, term_idx]
    clf = ensemble.RandomForestClassifier(n_estimators=200)
    clf.fit(X_train, y_train)

    X_new = X[test_idx, :]
    y_pred = clf.predict_proba(X_new)[:, 1]
    return y_pred
def main():
    """Fit Dfmf and Dfmc on blob-derived relations and show OWCompletionScoring."""
    from sklearn.datasets import make_blobs
    import numpy as np
    from AnyQt.QtWidgets import QApplication
    from orangecontrib.datafusion.models import FittedFusionGraph
    from orangecontrib.datafusion.widgets.owmeanfuser import MeanFuser
    from copy import deepcopy

    def scatter_unit_scaled(row_idx, col_idx, values):
        # Dense grid with `values` written at (row_idx, col_idx), min-max
        # scaled and wrapped as a masked array so a mask can be set later.
        grid = np.zeros((row_idx.max() + 1, col_idx.max() + 1))
        grid[row_idx, col_idx] = values
        low, high = grid.min(), grid.max()
        return np.ma.array((grid - low) / (high - low))

    points, _ = make_blobs(100, 3, centers=2, center_box=(-100, 100),
                           cluster_std=10)
    points = points.astype(int)
    points += abs(points.min())  # shift so every coordinate is a valid index
    col0, col1, col2 = points[:, 0], points[:, 1], points[:, 2]

    likes = scatter_unit_scaled(col0, col1, col2)
    features = scatter_unit_scaled(col2, col1, col0)

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)
    relations = [
        fusion.Relation(likes, users, movies, name='like'),
        fusion.Relation(features, actors, movies, name='feature in'),
    ]

    graph = fusion.FusionGraph()
    for rel in relations:
        # Randomly mask the cells whose uniform draw exceeds 0.8.
        rel.data.mask = np.random.rand(*rel.data.shape) > .8
        graph.add_relation(rel)

    dfmf_fuser = fusion.Dfmf()
    dfmf_fuser.fuse(graph)

    # The second fuser is fitted on an independent copy of the graph.
    graph = deepcopy(graph)
    dfmc_fuser = fusion.Dfmc()
    dfmc_fuser.name = 'My dfmc<3'
    dfmc_fuser.fuse(graph)

    application = QApplication([])
    widget = OWCompletionScoring()
    for fitted in (dfmf_fuser, dfmc_fuser):
        widget.on_fuser_change(FittedFusionGraph(fitted),
                               fitted.__class__.__name__)
    for axis, label in enumerate(('meanfuser0', 'meanfuser1', 'meanfuser2')):
        widget.on_fuser_change(MeanFuser(axis), label)
    for slot, rel in enumerate(relations, 1):
        widget.on_relation_change(Relation(rel), slot)
    widget.show()
    application.exec()
def factorization(self, cv_results_file):
    """
    Matrix factorization of the ratings relation alone.

    Selects the two ranks and the class threshold via
    self.cross_validation, then fits Dfmf on the ratings matrix. Saves
    predictions to self.predictions, the mask to self.mask, the
    NaN-filled ratings to self.true_values and the chosen threshold to
    self.t.

    :param cv_results_file: file for saving cv scores
    """
    print('\nDfmf')
    mask = self.split_train_test(self.users_ratings, 0.2)

    # Float copy of the ratings with every zero replaced by NaN
    # (presumably 0 encodes "no rating" - confirm with the data loader).
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    R12 = np.array(self.users_ratings, dtype=float)
    R12[R12 == 0] = np.nan

    # Search grid: candidate thresholds, candidate ranks, number of cv masks.
    t = [6, 7, 8]
    parameters = [10, 50, 100, 200, 400]
    k = 3
    best_p_t1, best_p_t2, best_t = self.cross_validation(
        k, parameters, t, mask, R12, cv_results_file)
    print(str(best_p_t1) + ' ' + str(best_p_t2) + ' ' + str(best_t) + '\n')
    self.t = best_t

    # Predictions
    t1 = fusion.ObjectType('Type 1', best_p_t1)
    t2 = fusion.ObjectType('Type 2', best_p_t2)
    relations = [fusion.Relation(R12, t1, t2, name='Ratings')]
    fusion_graph = fusion.FusionGraph()
    fusion_graph.add_relations_from(relations)
    fuser = fusion.Dfmf(init_type="random_vcol")
    fusion_graph['Ratings'].mask = mask.astype('bool')
    dfmf_mod = fuser.fuse(fusion_graph)
    R12_pred = dfmf_mod.complete(fusion_graph['Ratings'])

    self.predictions = R12_pred
    self.mask = mask
    self.true_values = R12
def fuse(train_idx):
    """Fit Dfmf on the rows selected by train_idx.

    :param train_idx: row indices (and, for the gene-gene relation, column
        indices) kept for training
    :returns: tuple (fitted fuser, fusion graph it was fitted on)
    """
    def training_rows(row_type, col_type):
        # Rows of the first relation between the two object types.
        return dicty[row_type][col_type][0].data[train_idx, :]

    annotation_rows = training_rows(gene, go_term)
    expression_rows = training_rows(gene, exp_cond)
    # Square relation: restrict both rows and columns to the training set.
    gene_gene = training_rows(gene, gene)[:, train_idx]

    graph = skf.FusionGraph([
        skf.Relation(annotation_rows, gene, go_term),
        skf.Relation(expression_rows, gene, exp_cond),
        skf.Relation(gene_gene, gene, gene),
    ])
    model = skf.Dfmf(max_iter=50, init_type="random_vcol")
    model.fuse(graph)
    return model, graph
def fuse(train_idx):
    """Fit Dfmf on the pharma relations restricted to the training chemicals.

    :param train_idx: indices of the chemical rows kept for training
    :returns: the fitted fuser
    """
    def matrix(row_type, col_type):
        # Data of the first relation between the two object types.
        return pharma[row_type][col_type][0].data

    # Relations whose rows are chemicals are cut down to the training rows.
    graph_relations = [
        skf.Relation(matrix(chemical, other)[train_idx], chemical, other)
        for other in (action, pmid, depositor, fingerprint)
    ]
    # The depositor-category relation has no chemical axis and is used whole.
    graph_relations.append(
        skf.Relation(matrix(depositor, depo_cat), depositor, depo_cat))
    # Square relation: restrict both rows and columns to the training set.
    graph_relations.append(
        skf.Relation(matrix(chemical, chemical)[train_idx, :][:, train_idx],
                     chemical, chemical))

    graph = skf.FusionGraph(graph_relations)
    model = skf.Dfmf(max_iter=200, init_type="random_vcol", random_state=0)
    model.fuse(graph)
    return model
def fit(self):
    """Build object types, relations and index labels, then fit Dfmf.

    Populates self.types, self.relations, self.indices, self.fusion_graph
    and self.fuser. Both Python's and NumPy's global random generators are
    seeded with self.random_state before fitting.
    """
    # One ObjectType per node; each (key, value) pair of self.nodes is
    # passed as the two positional arguments.
    self.types = {
        node: fusion.ObjectType(node, spec)
        for node, spec in self.nodes.items()
    }
    print(self.types)

    # One Relation per data frame listed under each (source, target) pair.
    self.relations = [
        fusion.Relation(frame.values, self.types[pair[0]],
                        self.types[pair[1]])
        for pair, frames in self.relation_definitions.items()
        for frame in frames
    ]
    print(self.relations)

    # Labels of each type come from the first frame in which it appears.
    self.indices = {}
    for (source, target), frames in self.relation_definitions.items():
        if source not in self.indices:
            self.indices[source] = list(frames[0].index)
        if target not in self.indices:
            self.indices[target] = list(frames[0].columns)

    random.seed(self.random_state)
    np.random.seed(self.random_state)

    self.fusion_graph = fusion.FusionGraph(self.relations)
    self.fuser = fusion.Dfmf(
        init_type=self.init_type,
        random_state=self.random_state,
        n_jobs=self.n_jobs,
    )
    self.fuser.fuse(self.fusion_graph)
def factorization(self, cv_results_file, use_user_data=True):
    """
    Matrix factorization with cross-validated ranks and threshold.

    Saves predictions to self.predictions, the mask to self.mask, the
    (optionally z-scored) NaN-filled ratings to self.true_values and the
    selected threshold to self.t.

    :param cv_results_file: file for saving cv scores
    :param use_user_data: when True the 'Users' relation is added to the
        fusion graph used for the final predictions
    """
    print('\nDfmf')
    R23 = self.selected_features
    mask = self.split_train_test(self.users_ratings, 0.2)
    R14 = self.users

    # Float copy of the ratings with every zero replaced by NaN
    # (presumably 0 encodes "no rating" - confirm with the data loader).
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    R12 = np.array(self.users_ratings, dtype=float)
    R12[R12 == 0] = np.nan
    if self.z_score:
        R12 = zscore(R12, axis=0, nan_policy='omit')

    # Parameters choice. A wider grid (ranks 10..70 / 2..12, thresholds
    # 4..8) was previously assigned here and immediately overwritten by
    # the values below; only the effective grid is kept.
    parameters_k1 = [60, 70]
    parameters_k2 = [60, 70]
    parameters_k3 = [8, 10]
    parameters_k4 = [8, 10]
    t = [6, 7]
    k = 3
    best_p_t1, best_p_t2, best_p_t3, best_p_t4, best_t = self.cross_validation(
        k, parameters_k1, parameters_k2, parameters_k3, parameters_k4, t,
        mask, R12, R23, R14, cv_results_file)
    print(
        str(best_p_t1) + ' ' + str(best_p_t2) + ' ' + str(best_p_t3) + ' ' +
        str(best_p_t4) + ' ' + str(best_t) + '\n')

    # Save best threshold for positive and negative class
    self.t = best_t

    # Predictions
    t1 = fusion.ObjectType('Type 1', best_p_t1)
    t2 = fusion.ObjectType('Type 2', best_p_t2)
    t3 = fusion.ObjectType('Type 3', best_p_t3)
    t4 = fusion.ObjectType('UserData', best_p_t4)
    relations = [
        fusion.Relation(R12, t1, t2, name='Ratings'),
        fusion.Relation(R23, t2, t3, name='Images'),
    ]
    if use_user_data:
        relations.append(fusion.Relation(R14, t1, t4, name='Users'))

    fusion_graph = fusion.FusionGraph()
    fusion_graph.add_relations_from(relations)
    fuser = fusion.Dfmf(init_type="random_vcol")
    fusion_graph['Ratings'].mask = mask
    dfmf_mod = fuser.fuse(fusion_graph)
    R12_pred = dfmf_mod.complete(fusion_graph['Ratings'])

    self.predictions = R12_pred
    self.mask = mask
    self.true_values = R12
def cross_validation(self, k, parameters_k1, parameters_k2, parameters_k3,
                     parameters_k4, parameters_t, mask, R12, R23, R14,
                     results_file):
    """
    Grid search over four ranks and the class threshold using k cv masks.

    Every combination is fitted once per cv mask; the masked entries of
    the ratings matrix are binarised at the threshold (2 above, 1
    otherwise) and scored with ROC AUC and rmse. All averaged scores are
    written to results_file as csv. If results_file already exists the
    search is skipped and its stored ranks are returned instead.

    :param k: number of cv masks for each parameter combination
    :param parameters_k1: candidate ranks for 'Type 1'
    :param parameters_k2: candidate ranks for 'Type 2'
    :param parameters_k3: candidate ranks for 'Type 3'
    :param parameters_k4: candidate ranks for 'UserData'
    :param parameters_t: candidate thresholds
    :param mask: mask for primary test and train set
    :param R12: matrix for dfmf (NOTE: not used - the ratings matrix is
        rebuilt from self.users_ratings below)
    :param R23: matrix for dfmf
    :param R14: matrix for dfmf
    :param results_file: file for saving cv scores
    :returns: best_p_t1, best_p_t2, best_p_t3, best_p_t4, best_t
    """
    from itertools import product

    print('\nCross validation\n')
    if path.exists(results_file):
        p1, p2, p3, p4 = self.load_results(results_file)
        # NOTE(review): the threshold is not read back from the file; a
        # fixed value of 7 is returned - confirm this is intended.
        return p1, p2, p3, p4, 7

    cv_masks = self.get_cv_masks(self.users_ratings, mask, k)

    # Float copy of the ratings with every zero replaced by NaN.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    R12 = np.array(self.users_ratings, dtype=float)
    R12[R12 == 0] = np.nan
    if self.z_score:
        # nan_policy='omit' is required: with the default 'propagate' every
        # column containing a NaN would become NaN in its entirety.
        R12 = zscore(R12, axis=0, nan_policy='omit')
    true_values = np.asarray(R12)

    best_cv_score = 0
    best_p_t1 = best_p_t2 = best_p_t3 = best_p_t4 = best_t = 0
    # One row per parameter combination, in csv column order.
    results = []

    grid = product(parameters_k1, parameters_k2, parameters_k3,
                   parameters_k4, parameters_t)
    for p_t1, p_t2, p_t3, p_t4, t in grid:
        scores = []
        scores_rmse = []
        for current_cv_mask in cv_masks:
            t1 = fusion.ObjectType('Type 1', p_t1)
            t2 = fusion.ObjectType('Type 2', p_t2)
            t3 = fusion.ObjectType('Type 3', p_t3)
            t4 = fusion.ObjectType('UserData', p_t4)
            relations = [
                fusion.Relation(R12, t1, t2, name='Ratings'),
                fusion.Relation(R23, t2, t3, name='Images'),
                fusion.Relation(R14, t1, t4, name='Users')
            ]
            fusion_graph = fusion.FusionGraph()
            fusion_graph.add_relations_from(relations)
            fuser = fusion.Dfmf(init_type="random_vcol")
            fusion_graph['Ratings'].mask = current_cv_mask
            dfmf_mod = fuser.fuse(fusion_graph)
            predictions = dfmf_mod.complete(fusion_graph['Ratings'])

            if self.z_score:
                # Rescale predictions with their own per-column statistics.
                # NOTE(review): mean/std come from the predictions, not
                # from the original ratings, while true_values stay
                # z-scored - confirm this is the intended de-normalisation.
                a = np.array(predictions, dtype=float)
                a[a == 0] = np.nan
                mns = np.nanmean(a=a, axis=0, keepdims=True)
                sstd = np.nanstd(a=a, axis=0, keepdims=True)
                predictions = (a * sstd) + mns

            # Score only the entries selected by the current cv mask,
            # binarised at t: 2 when above the threshold, 1 otherwise
            # (NaN compares False and therefore maps to 1).
            held_out = np.asarray(current_cv_mask, dtype=bool)
            ratings_true = np.where(true_values[held_out] > t, 2, 1)
            ratings_predicted = np.where(
                np.asarray(predictions)[held_out] > t, 2, 1)

            # Score
            scores.append(roc_auc_score(ratings_true, ratings_predicted))
            scores_rmse.append(rmse(ratings_true, ratings_predicted))

        score = sum(scores) / len(scores)
        score_rmse = sum(scores_rmse) / len(scores_rmse)
        results.append((p_t1, p_t2, p_t3, p_t4, t, score, score_rmse))

        # Save best scores to a variable (ties go to the later combination)
        if score >= best_cv_score:
            best_cv_score = score
            best_p_t1 = p_t1
            best_p_t2 = p_t2
            best_p_t3 = p_t3
            best_p_t4 = p_t4
            best_t = t

    # Save cv scores to a csv file
    df = pd.DataFrame(
        results,
        columns=['p_t1', 'p_t2', 'p_t3', 'p_t4', 't', 'score', 'rmse'])
    df.to_csv(results_file)

    return best_p_t1, best_p_t2, best_p_t3, best_p_t4, best_t
def factorization(self, cv_results_file, use_user_data=True):
    """
    Matrix factorization with cross-validated ranks.

    Saves predictions to self.predictions, the mask to self.mask, the
    (optionally z-scored) NaN-filled ratings to self.true_values, and a
    class threshold to self.t. The threshold is the element at position
    round(n / 2) of the sorted non-zero ratings.

    :param cv_results_file: file for saving cv scores
    :param use_user_data: when True the 'Users' relation is added to the
        fusion graph used for the final predictions
    :raises IndexError: if self.users_ratings has no non-zero entry
    """
    print('\nDfmf')
    R23 = self.selected_features

    # Threshold: middle element of the sorted non-zero ratings.
    ratings = np.asarray(self.users_ratings)
    r = np.sort(ratings[ratings != 0])
    t = r[round(len(r) / 2)]
    self.t = t

    mask = self.split_train_test(self.users_ratings, 0.2)
    R14 = self.users

    # Float copy of the ratings with every zero replaced by NaN
    # (presumably 0 encodes "no rating" - confirm with the data loader).
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    R12 = np.array(ratings, dtype=float)
    R12[R12 == 0] = np.nan
    if self.z_score:
        # nan_policy='omit' is required: with the default 'propagate' every
        # column containing a NaN would become NaN in its entirety.
        R12 = zscore(R12, axis=0, nan_policy='omit')

    # Parameters choice
    print('\nParameters\n')
    parameters = [2, 4, 6, 8, 10, 12, 14, 16, 18]
    k = 3
    best_p_t1, best_p_t2, best_p_t3, best_p_t4 = self.cross_validation(
        k, parameters, mask, R12, R23, R14, cv_results_file)
    print(str(best_p_t1) + ' ' + str(best_p_t2) + ' ' + str(best_p_t3) +
          ' ' + str(best_p_t4) + '\n')

    # Predictions
    t1 = fusion.ObjectType('Type 1', best_p_t1)
    t2 = fusion.ObjectType('Type 2', best_p_t2)
    t3 = fusion.ObjectType('Type 3', best_p_t3)
    t4 = fusion.ObjectType('UserData', best_p_t4)
    relations = [
        fusion.Relation(R12, t1, t2, name='Ratings'),
        fusion.Relation(R23, t2, t3, name='Images'),
    ]
    if use_user_data:
        relations.append(fusion.Relation(R14, t1, t4, name='Users'))

    fusion_graph = fusion.FusionGraph()
    fusion_graph.add_relations_from(relations)
    fuser = fusion.Dfmf(init_type="random_vcol")
    fusion_graph['Ratings'].mask = mask
    dfmf_mod = fuser.fuse(fusion_graph)
    R12_pred = dfmf_mod.complete(fusion_graph['Ratings'])

    self.predictions = R12_pred
    self.mask = mask
    self.true_values = R12
# numpy is used below as `np`; imported here so the script runs on its own.
import numpy as np

from skfusion import fusion
from skfusion import datasets

# Three random relation matrices between three object types.
R12 = np.random.rand(50, 100)
R13 = np.random.rand(50, 40)
R23 = np.random.rand(100, 40)

t1 = fusion.ObjectType('Type 1', 10)
t2 = fusion.ObjectType('Type 2', 20)
t3 = fusion.ObjectType('Type 3', 30)
relations = [fusion.Relation(R12, t1, t2),
             fusion.Relation(R13, t1, t3),
             fusion.Relation(R23, t2, t3)]
fusion_graph = fusion.FusionGraph()
fusion_graph.add_relations_from(relations)

# Fit the factorization and inspect the latent factor of the first type.
fuser = fusion.Dfmf()
fuser.fuse(fusion_graph)
print(fuser.factor(t1).shape)

# Ten new 'Type 1' rows, described through the same two target types.
new_R12 = np.random.rand(10, 100)
new_R13 = np.random.rand(10, 40)
new_relations = [fusion.Relation(new_R12, t1, t2),
                 fusion.Relation(new_R13, t1, t3)]
new_graph = fusion.FusionGraph(new_relations)

# Transform the new rows with the already fitted fuser.
transformer = fusion.DfmfTransform()
transformer.transform(t1, new_graph, fuser)
print(transformer.factor(t1).shape)

dicty = datasets.load_dicty()
def cross_validation(self, k, parameters, parameters_t, mask, R12,
                     results_file):
    """
    Grid search over two ranks and the class threshold using k cv masks.

    Every (rank, rank, threshold) combination is fitted once per cv mask;
    the masked entries of R12 and of the completed matrix are binarised
    at the threshold (2 above, 1 otherwise) and scored with ROC AUC. The
    averaged scores are written to results_file as csv. If results_file
    already exists the search is skipped and
    self.load_results(results_file) is returned instead.

    :param k: number of cv masks for each parameter combination
    :param parameters: candidate ranks, used for both object types
    :param parameters_t: candidate thresholds
    :param mask: mask for primary test and train set
    :param R12: matrix for dfmf
    :param results_file: file for saving cv scores
    :returns: best_p_t1, best_p_t2, best_t
    """
    from itertools import product

    if path.exists(results_file):
        return self.load_results(results_file)

    cv_masks = self.get_cv_masks(self.users_ratings, mask, k)
    true_values = np.asarray(R12)

    best_cv_score = 0
    best_p_t1 = 0
    best_p_t2 = 0
    best_t = 0
    # One row per parameter combination, in csv column order.
    results = []

    for p_t1, p_t2, t in product(parameters, parameters, parameters_t):
        scores = []
        for current_cv_mask in cv_masks:
            t1 = fusion.ObjectType('Type 1', p_t1)
            t2 = fusion.ObjectType('Type 2', p_t2)
            relations = [fusion.Relation(R12, t1, t2, name='Ratings')]
            fusion_graph = fusion.FusionGraph()
            fusion_graph.add_relations_from(relations)
            fuser = fusion.Dfmf(init_type="random_vcol")
            fusion_graph['Ratings'].mask = current_cv_mask
            dfmf_mod = fuser.fuse(fusion_graph)
            predictions = np.asarray(
                dfmf_mod.complete(fusion_graph['Ratings']))

            # Score only the entries selected by the current cv mask,
            # binarised at t: 2 when above the threshold, 1 otherwise
            # (NaN compares False and therefore maps to 1).
            held_out = np.asarray(current_cv_mask, dtype=bool)
            ratings_true = np.where(true_values[held_out] > t, 2, 1)
            ratings_predicted = np.where(predictions[held_out] > t, 2, 1)

            # ROC AUC of the binarised predictions
            scores.append(roc_auc_score(ratings_true, ratings_predicted))

        score = sum(scores) / len(scores)
        results.append((p_t1, p_t2, t, score))

        # Save best scores to a variable (ties go to the later combination)
        if score >= best_cv_score:
            best_cv_score = score
            best_p_t1 = p_t1
            best_p_t2 = p_t2
            best_t = t

    # Save cv scores to a csv file
    df = pd.DataFrame(results, columns=['p_t1', 'p_t2', 't', 'score'])
    df.to_csv(results_file)

    return best_p_t1, best_p_t2, best_t