def main():
    """Check ``MeanFuser.complete`` on a tiny masked matrix, then show the widget.

    After the assertions pass, a two-relation fusion graph built from random
    data is handed to ``OWMeanFuser`` and the Qt event loop is started.
    """
    from AnyQt.QtWidgets import QApplication

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)

    # test that MeanFuser completes correctly
    tiny = np.ma.array([[1, 1, 0], [3, 0, 0]],
                       mask=[[0, 0, 1], [0, 1, 1]],
                       dtype=float)
    rel = fusion.Relation(tiny, users, movies)
    expectations = (
        (0, [[1, 1, 5 / 3], [3, 1, 5 / 3]]),
        (1, [[1, 1, 1], [3, 3, 3]]),
        (2, [[1, 1, 5 / 3], [3, 5 / 3, 5 / 3]]),
    )
    for fuser_arg, completed in expectations:
        assert (MeanFuser(fuser_arg).complete(rel) == completed).all()

    # Random demo data; the second matrix has roughly 20% of its cells masked.
    dense = np.ma.array(np.random.random((20, 20)))
    sparse_values = np.random.random((40, 40))
    sparse_mask = np.random.random((40, 40)) > .8
    sparse = np.ma.array(sparse_values, mask=sparse_mask)

    graph = fusion.FusionGraph()
    graph.add_relations_from([
        fusion.Relation(dense, users, movies, name='like'),
        fusion.Relation(sparse, actors, movies, name='feature in'),
    ])

    app = QApplication([])
    widget = OWMeanFuser()
    widget.on_fusion_graph_change(graph)
    widget.show()
    app.exec()
def main():
    """Fit a Dfmf model on random relations and display it in ``OWChaining``."""
    # example from https://github.com/marinkaz/scikit-fusion
    import numpy as np
    from AnyQt.QtWidgets import QApplication

    users_movies = np.random.rand(50, 100)
    movies_actors = np.random.rand(100, 150)
    actors_actors = np.random.rand(150, 150)
    users_actors = np.random.rand(50, 150)

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)

    graph = fusion.FusionGraph()
    for relation in (
            fusion.Relation(users_movies, users, movies, name='like'),
            fusion.Relation(users_actors, users, actors, name='are fans of'),
            fusion.Relation(users_movies, users, movies, name='don\'t like'),
            fusion.Relation(actors_actors, actors, actors, name='married to'),
            fusion.Relation(movies_actors, movies, actors, name='feature'),
    ):
        graph.add_relation(relation)

    model = fusion.Dfmf()
    model.fuse(graph)

    app = QApplication([])
    widget = OWChaining()
    widget.on_fuser_change(FittedFusionGraph(model))
    widget.show()
    app.exec()
def main():
    """Run 10-fold cross-validation of Dfmc on the gene/GO-term relation.

    For every fold the test rows of the annotation relation are masked, the
    graph is fused, and the completed test rows are compared with the stored
    annotations. The mean of the per-fold MSE values is printed.
    """
    n_folds = 10
    annotation_rel = dicty[gene][go_term][0]
    n_genes = annotation_rel.data.shape[0]
    folds = cross_validation.KFold(n_genes, n_folds=n_folds)
    fold_mse = np.zeros(n_folds)
    ann_mask = np.zeros_like(annotation_rel.data).astype('bool')

    fusion_graph = skf.FusionGraph([
        skf.Relation(annotation_rel.data, gene, go_term),
        skf.Relation(dicty[gene][exp_cond][0].data, gene, exp_cond),
        skf.Relation(dicty[gene][gene][0].data, gene, gene),
    ])
    fuser = skf.Dfmc(max_iter=30, n_run=1, init_type='random', random_state=0)

    for fold_no, (train_idx, test_idx) in enumerate(folds):
        # The same mask array is reused: clear it, then hide this fold's rows.
        ann_mask[:] = False
        ann_mask[test_idx, :] = True
        fusion_graph[gene][go_term][0].mask = ann_mask
        fuser.fuse(fusion_graph)

        fitted_rel = fuser.fusion_graph[gene][go_term][0]
        pred_ann = fuser.complete(fitted_rel)[test_idx]
        true_ann = dicty[gene][go_term][0].data[test_idx]
        fold_mse[fold_no] = metrics.mean_squared_error(pred_ann, true_ann)

    print("MSE: %5.4f" % np.mean(fold_mse))
def main():
    """Fit a Dfmf model on random relations and display it in ``OWLatentFactors``."""
    # example from https://github.com/marinkaz/scikit-fusion
    import numpy as np

    users_movies = np.random.rand(50, 100)
    actors_movies = np.random.rand(150, 100)
    actors_actors = np.random.rand(150, 150)

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)

    graph = fusion.FusionGraph()
    for relation in (
            fusion.Relation(users_movies, users, movies, name='like'),
            fusion.Relation(users_movies, users, movies, name='don\'t like'),
            fusion.Relation(actors_actors, actors, actors, name='married to'),
            fusion.Relation(actors_movies, actors, movies, name='play in'),
    ):
        graph.add_relation(relation)

    model = fusion.Dfmf()
    model.fuse(graph)

    app = QtGui.QApplication([])
    widget = OWLatentFactors()
    widget.on_fuser_change(FittedFusionGraph(model))
    widget.show()
    app.exec()
def commit(self):
    """Wrap the current data table in a relation and send it to the output.

    Nothing is sent when ``self.data`` is falsy. Class variables and metas
    become per-row metadata; when ``self.transpose`` is set the matrix is
    transposed and that metadata is attached to the columns instead.
    """
    if not self.data:
        return

    domain = self.data.domain
    metadata_cols = list(domain.class_vars) + list(domain.metas)
    metadata = []
    for values in self.data[:, metadata_cols]:
        metadata.append({
            var: var.to_val(value)
            for var, value in zip(metadata_cols, values.list)
        })

    if self.transpose:
        # Rows and columns swap roles, so names/types/metadata swap too.
        relation = fusion.Relation(
            self.data.X.T,
            name=self.relation_name,
            row_type=fusion.ObjectType(self.col_type or 'Unknown'),
            row_names=self.col_names,
            col_type=fusion.ObjectType(self.row_type or 'Unknown'),
            col_names=self.row_names,
            col_metadata=metadata)
    else:
        relation = fusion.Relation(
            self.data.X,
            name=self.relation_name,
            row_type=fusion.ObjectType(self.row_type or 'Unknown'),
            row_names=self.row_names,
            row_metadata=metadata,
            col_type=fusion.ObjectType(self.col_type or 'Unknown'),
            col_names=self.col_names)

    self.Outputs.relation.send(Relation(relation))
def factorization(self):
    """
    Matrix factorization, saves predictions to self.predictions and mask
    to self.mask

    Three relations (user ratings, image features, user data) are fused with
    Dfmf; the ratings relation is masked with a 0.2 train/test split before
    fusing and is then completed. The unmodified ratings are stored in
    ``self.true_values``.
    """
    print('\nDfmf')

    image_features = self.selected_features
    held_out = self.split_train_test(self.users_ratings, 0.2)
    ratings = self.users_ratings
    user_data = self.users

    type_1 = fusion.ObjectType('Type 1', 10)
    type_2 = fusion.ObjectType('Type 2', 10)
    type_3 = fusion.ObjectType('Type 3', 10)
    type_users = fusion.ObjectType('UserData', 10)

    graph = fusion.FusionGraph()
    graph.add_relations_from([
        fusion.Relation(ratings, type_1, type_2, name='User ratings'),
        fusion.Relation(image_features, type_2, type_3, name='Images'),
        fusion.Relation(user_data, type_1, type_users, name='Users'),
    ])

    fuser = fusion.Dfmf(init_type="random_vcol")
    graph['User ratings'].mask = held_out
    fitted = fuser.fuse(graph)

    self.predictions = fitted.complete(graph['User ratings'])
    self.mask = held_out
    self.true_values = ratings
def mf(train_idx, test_idx, term_idx):
    """Predict one GO term for the test genes from a fused expression matrix.

    The test rows of the annotation matrix are zeroed, the three gene
    relations are fused with Dfmf, and the completed gene/condition matrix is
    used as features for a random forest trained on the training genes.

    :param train_idx: row indices used for training the classifier
    :param test_idx: row indices whose annotations are hidden and predicted
    :param term_idx: column of the annotation matrix used as the target
    :returns: second column of ``predict_proba`` for the test rows
    """
    annotation_rel = dicty[gene][go_term][0]
    expression_rel = dicty[gene][exp_cond][0]

    hidden = annotation_rel.data.copy()
    hidden[test_idx, :] = 0

    fusion_graph = skf.FusionGraph([
        skf.Relation(hidden, gene, go_term),
        skf.Relation(expression_rel.data, gene, exp_cond),
        skf.Relation(dicty[gene][gene][0].data, gene, gene),
    ])
    fuser = skf.Dfmf(max_iter=10, n_run=1, init_type="random_vcol",
                     random_state=0)

    # Ranks are set to a fixed fraction of each type's dimension.
    # NOTE(review): the products are floats, not ints - confirm the fuser accepts that.
    p = 0.7
    gene.rank = p * annotation_rel.data.shape[0]
    exp_cond.rank = p * expression_rel.data.shape[1]
    go_term.rank = p * annotation_rel.data.shape[1]

    fuser.fuse(fusion_graph)
    completed = fuser.complete(fusion_graph[gene][exp_cond][0])

    features_train = completed[train_idx, :]
    target_train = dicty[gene][go_term][0].data[train_idx, term_idx]
    forest = ensemble.RandomForestClassifier(n_estimators=200)
    forest.fit(features_train, target_train)

    features_test = completed[test_idx, :]
    return forest.predict_proba(features_test)[:, 1]
def main():
    """Build two blob-derived relations, fit several fusers and show the scorer.

    The relations are randomly masked, fused once with Dfmf and once (on a deep
    copy of the graph) with Dfmc, and all fusers plus three ``MeanFuser``
    instances are fed to ``OWCompletionScoring``.
    """
    from copy import deepcopy

    from sklearn.datasets import make_blobs
    import numpy as np
    from AnyQt.QtWidgets import QApplication
    from orangecontrib.datafusion.models import FittedFusionGraph
    from orangecontrib.datafusion.widgets.owmeanfuser import MeanFuser

    def scattered(row_col, col_col, value_col):
        """Place one blob coordinate at positions given by two others, min-max scaled."""
        rows, cols = X[:, row_col], X[:, col_col]
        dense = np.zeros((rows.max() + 1, cols.max() + 1))
        dense[rows, cols] = X[:, value_col]
        return np.ma.array((dense - dense.min()) / (dense.max() - dense.min()))

    X, _ = make_blobs(100, 3, centers=2, center_box=(-100, 100),
                      cluster_std=10)
    X = X.astype(int)
    X += abs(X.min())  # shift so every coordinate is a valid index

    R1 = scattered(0, 1, 2)
    R2 = scattered(2, 1, 0)

    users = fusion.ObjectType('Users', 10)
    movies = fusion.ObjectType('Movies', 30)
    actors = fusion.ObjectType('Actors', 40)
    relations = [
        fusion.Relation(R1, users, movies, name='like'),
        fusion.Relation(R2, actors, movies, name='feature in'),
    ]

    graph = fusion.FusionGraph()
    for relation in relations:
        relation.data.mask = np.random.rand(*relation.data.shape) > .8
        graph.add_relation(relation)

    fuserF = fusion.Dfmf()
    fuserF.fuse(graph)

    graph = deepcopy(graph)
    fuserC = fusion.Dfmc()
    fuserC.name = 'My dfmc<3'
    fuserC.fuse(graph)

    app = QApplication([])
    widget = OWCompletionScoring()
    widget.on_fuser_change(FittedFusionGraph(fuserF),
                           fuserF.__class__.__name__)
    widget.on_fuser_change(FittedFusionGraph(fuserC),
                           fuserC.__class__.__name__)
    for fuser_arg, fuser_id in ((0, 'meanfuser0'),
                                (1, 'meanfuser1'),
                                (2, 'meanfuser2')):
        widget.on_fuser_change(MeanFuser(fuser_arg), fuser_id)
    for slot, relation in enumerate(relations, 1):
        widget.on_relation_change(Relation(relation), slot)
    widget.show()
    app.exec()
def transform(fuser, test_idx):
    """Project the test genes into the latent space of an already fitted fuser.

    :param fuser: fitted fuser passed through to ``DfmfTransform.transform``
    :param test_idx: indices of the genes to transform
    :returns: the ``DfmfTransform`` instance after ``transform`` was called
    """
    expression = dicty[gene][exp_cond][0].data[test_idx, :]
    # Gene-gene matrix restricted to the test genes on both axes.
    interactions = dicty[gene][gene][0].data[test_idx, :][:, test_idx]

    graph = skf.FusionGraph([
        skf.Relation(expression, gene, exp_cond),
        skf.Relation(interactions, gene, gene),
    ])
    transformer = skf.DfmfTransform(max_iter=50, init_type="random_vcol")
    transformer.transform(gene, graph, fuser)
    return transformer
def fuse(train_idx):
    """Fuse the three gene relations restricted to the training genes.

    :param train_idx: indices of the genes used for fitting
    :returns: tuple ``(fuser, fusion_graph)`` after ``fuse`` was called
    """
    annotations = dicty[gene][go_term][0].data[train_idx, :]
    expression = dicty[gene][exp_cond][0].data[train_idx, :]
    # Gene-gene matrix restricted to the training genes on both axes.
    interactions = dicty[gene][gene][0].data[train_idx, :][:, train_idx]

    fusion_graph = skf.FusionGraph([
        skf.Relation(annotations, gene, go_term),
        skf.Relation(expression, gene, exp_cond),
        skf.Relation(interactions, gene, gene),
    ])
    fuser = skf.Dfmf(max_iter=50, init_type="random_vcol")
    fuser.fuse(fusion_graph)
    return fuser, fusion_graph
def send_output(self):
    """Send the actor/movie and actor/actor relations for the current movies.

    Does nothing while ``self.movies`` is ``None``.
    """
    if self.movies is None:
        return

    movie_actor_mat, actors = movielens.movie_concept_matrix(
        self.movies, concept="actor", actors=self.percent)
    actor_actor_mat = movielens.actor_matrix(movie_actor_mat)

    actor_type = movielens.ObjectType.Actors

    # The movie x actor matrix is transposed so that actors index the rows.
    play_in = fusion.Relation(
        movie_actor_mat.T,
        name='play in',
        row_type=actor_type,
        row_names=actors,
        col_type=movielens.ObjectType.Movies,
        col_names=self.movies)
    self.Outputs.movie_actors.send(Relation(play_in))

    costar = fusion.Relation(
        actor_actor_mat,
        name='costar with',
        row_type=actor_type,
        row_names=actors,
        col_type=movielens.ObjectType.Actors,
        col_names=actors)
    self.Outputs.actors_actors.send(Relation(costar))
def create(cls, data, row_type, col_type, graph=None):
    """Build a ``Relation`` around ``data``, filling names/metadata from ``graph``.

    :param data: matrix passed to ``fusion.Relation``
    :param row_type: object type of the rows; when falsy a fresh type is taken
        from ``GENERATE_OTYPE``
    :param col_type: object type of the columns; when falsy a fresh type is
        taken from ``GENERATE_OTYPE``
    :param graph: optional graph queried (``get_names`` / ``get_metadata``)
        for the names and metadata of the given types
    :returns: ``Relation`` wrapping the newly created ``fusion.Relation``
    """
    row_names = row_metadata = col_names = col_metadata = None

    if row_type:
        if graph:
            row_names = graph.get_names(row_type)
            row_metadata = graph.get_metadata(row_type)
            # Metadata consisting only of falsy entries is treated as absent.
            if not any(row_metadata):
                row_metadata = None
    else:
        row_type = next(GENERATE_OTYPE)

    if col_type:
        if graph:
            col_names = graph.get_names(col_type)
            # Fixed: column metadata used to be looked up with ``row_type``.
            col_metadata = graph.get_metadata(col_type)
            if not any(col_metadata):
                col_metadata = None
    else:
        # Fixed: a stray ``, None`` used to turn the generated type into a
        # tuple, unlike the row branch above.
        col_type = next(GENERATE_OTYPE)

    return Relation(
        fusion.Relation(data, row_type, col_type,
                        row_names=row_names,
                        row_metadata=row_metadata,
                        col_names=col_names,
                        col_metadata=col_metadata))
def send_output(self):
    """Send the rows-by-genres relation on the "Genres" channel.

    Does nothing while ``self.data`` is ``None``.
    """
    if self.data is None:
        return

    genre_type = fusion.ObjectType("Genres")
    relation = fusion.Relation(
        self.matrix,
        name=self.relation_name,
        row_type=self.row_type,
        row_names=self.row_names,
        col_type=genre_type,
        col_names=self.genres)
    self.send("Genres", Relation(relation))
def transform(fuser, test_idx):
    """Project the test chemicals into the latent space of a fitted fuser.

    :param fuser: fitted fuser passed through to ``DfmfTransform.transform``
    :param test_idx: indices of the chemicals to transform
    :returns: the ``DfmfTransform`` instance after ``transform`` was called
    """
    def rows_of(other_type):
        """Test rows of the first chemical/``other_type`` relation."""
        return pharma[chemical][other_type][0].data[test_idx]

    pubmed_data = rows_of(pmid)
    depositor_data = rows_of(depositor)
    fingerprint_data = rows_of(fingerprint)
    # Chemical-chemical matrix restricted to the test set on both axes.
    chemical_data = pharma[chemical][chemical][0].data[test_idx, :][:, test_idx]

    fusion_graph = skf.FusionGraph([
        skf.Relation(pubmed_data, chemical, pmid),
        skf.Relation(depositor_data, chemical, depositor),
        skf.Relation(fingerprint_data, chemical, fingerprint),
        skf.Relation(chemical_data, chemical, chemical),
    ])
    transformer = skf.DfmfTransform(max_iter=200, init_type="random_vcol",
                                    random_state=0)
    transformer.transform(chemical, fusion_graph, fuser)
    return transformer
def factorization(self, cv_results_file):
    """
    Matrix factorization, saves predictions to self.predictions and mask
    to self.mask

    Zero ratings are treated as missing and replaced by NaN. The ranks of
    both object types and the rating threshold are chosen by
    ``self.cross_validation``; the chosen threshold is stored in ``self.t``.
    The NaN-filled rating matrix is stored in ``self.true_values``.

    :param cv_results_file: file for saving cv scores
    """
    print('\nDfmf')

    mask = self.split_train_test(self.users_ratings, 0.2)

    # Vectorised replacement for the former element-wise loop; ``np.nan`` is
    # used because the ``np.NaN`` alias was removed in NumPy 2.0.
    ratings = np.asarray(self.users_ratings, dtype=float)
    R12 = np.where(ratings == 0, np.nan, ratings)

    thresholds = [6, 7, 8]
    parameters = [10, 50, 100, 200, 400]
    k = 3

    best_p_t1, best_p_t2, best_t = self.cross_validation(
        k, parameters, thresholds, mask, R12, cv_results_file)
    print(str(best_p_t1) + ' ' + str(best_p_t2) + ' ' + str(best_t) + '\n')
    self.t = best_t

    # Predictions
    t1 = fusion.ObjectType('Type 1', best_p_t1)
    t2 = fusion.ObjectType('Type 2', best_p_t2)
    relations = [fusion.Relation(R12, t1, t2, name='Ratings')]

    fusion_graph = fusion.FusionGraph()
    fusion_graph.add_relations_from(relations)
    fuser = fusion.Dfmf(init_type="random_vcol")
    fusion_graph['Ratings'].mask = mask.astype('bool')
    dfmf_mod = fuser.fuse(fusion_graph)

    self.predictions = dfmf_mod.complete(fusion_graph['Ratings'])
    self.mask = mask
    self.true_values = R12
def main():
    """Show ``OWFusionGraph`` and feed it random relations one per timer tick.

    A 500 ms timer adds the relations in order; once all are added the timer
    is stopped and relation #4 is removed again.
    """
    # example from https://github.com/marinkaz/scikit-fusion
    import numpy as np
    from AnyQt.QtWidgets import QApplication

    R12 = np.random.rand(50, 100)
    R22 = np.random.rand(100, 100)
    R13 = np.random.rand(50, 40)
    R31 = np.random.rand(40, 50)
    R23 = np.random.rand(100, 40)  # was assigned twice; the duplicate is removed
    R24 = np.random.rand(100, 40)
    R34 = np.random.rand(40, 40)

    t1 = fusion.ObjectType('Users', 10)
    t2 = fusion.ObjectType('Actors', 20)
    t3 = fusion.ObjectType('Movies', 30)
    t4 = fusion.ObjectType('Genres', 40)

    relations = [
        fusion.Relation(R12, t1, t2, name='like'),
        fusion.Relation(R13, t1, t3, name='rated'),
        fusion.Relation(R13, t1, t3, name='mated'),
        fusion.Relation(R23, t2, t3, name='play in'),
        fusion.Relation(R31, t3, t1),
        fusion.Relation(R24, t2, t4, name='prefer'),
        fusion.Relation(R34, t3, t4, name='belong to'),
        fusion.Relation(R22, t2, t2, name='married to'),
    ]

    app = QApplication(['asdf'])
    w = OWFusionGraph()
    w.show()

    # Shared iterator of (slot, wrapped relation); it replaces the former
    # default-argument iterators, one of which shadowed the builtin ``id``.
    pending = enumerate(map(Relation, relations))

    def _add_next_relation(event):
        """Timer callback: add the next relation, or finish when exhausted."""
        try:
            slot, relation = next(pending)
        except StopIteration:
            w.killTimer(w.timer_id)
            w.on_relation_change(None, 4)  # Remove relation #4
        else:
            w.on_relation_change(relation, slot)

    w.timerEvent = _add_next_relation
    w.timer_id = w.startTimer(500)
    app.exec()
def send_output(self):
    """Build the users-by-movies rating relation and send it to the output.

    With ``self.method == 0`` the matrix is selected by percentage, otherwise
    by the ``self.start`` / ``self.end`` year range. If the year range raises
    ``ValueError`` an error is shown, ``None`` is sent and the method stops.
    """
    if self.method == 0:
        matrix, movies, users = movielens.movie_user_matrix(
            percentage=self.percent)
    else:
        try:
            matrix, movies, users = movielens.movie_user_matrix(
                start_year=self.start, end_year=self.end)
        except ValueError:
            self.error(0, "Invalid starting years")
            self.Outputs.relation.send(None)
            # Fixed: without this return, execution continued and failed on
            # the unbound ``matrix`` below.
            return

    # The movie x user matrix is transposed so that users index the rows.
    relation = fusion.Relation(matrix.T,
                               name='rate',
                               row_type=movielens.ObjectType.Users,
                               row_names=users,
                               col_type=movielens.ObjectType.Movies,
                               col_names=movies)
    self.Outputs.relation.send(Relation(relation))
def send_output(self):
    """Build the min-max scaled users-by-movies relation and send it.

    With ``self.method == 0`` the matrix is selected by percentage, otherwise
    by the ``self.start`` / ``self.end`` year range. If the year range raises
    ``ValueError`` an error is shown, ``None`` is sent on "Ratings" and the
    method stops.
    """
    if self.method == 0:
        matrix, movies, users = movielens.movie_user_matrix(
            percentage=self.percent)
    else:
        try:
            matrix, movies, users = movielens.movie_user_matrix(
                start_year=self.start, end_year=self.end)
        except ValueError:
            self.error(0, "Invalid starting years")
            self.send("Ratings", None)
            # Fixed: without this return, execution continued and failed on
            # the unbound ``matrix`` below.
            return

    def scale(X):
        """Min-max scale ``X`` to [0, 1], ignoring NaNs when finding the range."""
        return (X - np.nanmin(X)) / (np.nanmax(X) - np.nanmin(X))

    # The movie x user matrix is transposed so that users index the rows.
    relation = fusion.Relation(matrix.T,
                               name='rate',
                               row_type=movielens.ObjectType.Users,
                               row_names=users,
                               col_type=movielens.ObjectType.Movies,
                               col_names=movies,
                               preprocessor=scale)
    self.send("Ratings", Relation(relation))
def fit(self):
    """Build object types, relations and index lists, then fuse with Dfmf.

    ``self.nodes`` maps a type name to the second ``ObjectType`` argument;
    ``self.relation_definitions`` maps ``(src, dst)`` name pairs to a sequence
    of frames whose ``.values`` become relation matrices. Both RNGs are seeded
    with ``self.random_state`` before fusing.
    """
    self.types = {
        name: fusion.ObjectType(name, spec)
        for name, spec in self.nodes.items()
    }
    print(self.types)

    self.relations = [
        fusion.Relation(frame.values, self.types[key[0]], self.types[key[1]])
        for key, frames in self.relation_definitions.items()
        for frame in frames
    ]
    print(self.relations)

    # Remember row/column labels per type; the first definition seen wins and
    # only the first frame of each definition is consulted.
    self.indices = {}
    for (src, dst), frames in self.relation_definitions.items():
        first = frames[0]
        if src not in self.indices:
            self.indices[src] = list(first.index)
        if dst not in self.indices:
            self.indices[dst] = list(first.columns)

    random.seed(self.random_state)
    np.random.seed(self.random_state)

    self.fusion_graph = fusion.FusionGraph(self.relations)
    self.fuser = fusion.Dfmf(init_type=self.init_type,
                             random_state=self.random_state,
                             n_jobs=self.n_jobs)
    self.fuser.fuse(self.fusion_graph)
def fuse(train_idx):
    """Fuse the pharma relations restricted to the training chemicals.

    The depositor/category relation is used in full; every relation that has
    chemicals on its rows is cut down to ``train_idx``.

    :param train_idx: indices of the chemicals used for fitting
    :returns: the fuser after ``fuse`` was called
    """
    def train_rows(other_type):
        """Training rows of the first chemical/``other_type`` relation."""
        return pharma[chemical][other_type][0].data[train_idx]

    action_data = train_rows(action)
    pubmed_data = train_rows(pmid)
    depositor_data = train_rows(depositor)
    fingerprint_data = train_rows(fingerprint)
    depo_cat_data = pharma[depositor][depo_cat][0].data
    # Chemical-chemical matrix restricted to the training set on both axes.
    chemical_data = pharma[chemical][chemical][0].data[train_idx, :][:, train_idx]

    fusion_graph = skf.FusionGraph([
        skf.Relation(action_data, chemical, action),
        skf.Relation(pubmed_data, chemical, pmid),
        skf.Relation(depositor_data, chemical, depositor),
        skf.Relation(fingerprint_data, chemical, fingerprint),
        skf.Relation(depo_cat_data, depositor, depo_cat),
        skf.Relation(chemical_data, chemical, chemical),
    ])
    fuser = skf.Dfmf(max_iter=200, init_type="random_vcol", random_state=0)
    fuser.fuse(fusion_graph)
    return fuser
import numpy as np
from skfusion import fusion

# Random demo matrices; the name RXY relates object type X (rows) to type Y
# (columns) in the relation list below.
R12 = np.random.rand(50, 100)
R22 = np.random.rand(100, 100)
R13 = np.random.rand(50, 40)
R31 = np.random.rand(40, 50)
R23 = np.random.rand(100, 40)  # was assigned twice; the duplicate is removed
R24 = np.random.rand(100, 400)
R34 = np.random.rand(40, 400)

t1 = fusion.ObjectType('Users', 10)
t2 = fusion.ObjectType('Actors', 20)
t3 = fusion.ObjectType('Movies', 30)
t4 = fusion.ObjectType('Genres', 40)

relations = [
    fusion.Relation(R12, t1, t2, name='like'),
    fusion.Relation(R13, t1, t3, name='rated'),
    fusion.Relation(R13, t1, t3, name='mated'),
    fusion.Relation(R23, t2, t3, name='play in'),
    fusion.Relation(R31, t3, t1),
    fusion.Relation(R24, t2, t4, name='prefer'),
    fusion.Relation(R34, t3, t4, name='belong to'),
    fusion.Relation(R22, t2, t2, name='married to'),
]

# ``widget``, ``app`` and ``sys`` are defined outside this fragment.
for rel in relations:
    widget.addRelation(rel)

sys.exit(app.exec_())
def cross_validation(self, k, parameters, parameters_t, mask, R12,
                     results_file):
    """
    Grid search over the two type ranks and the rating threshold.

    If ``results_file`` already exists its contents are returned through
    ``self.load_results`` and nothing is computed. Otherwise, for every
    combination a Dfmf model is fitted once per cv mask, the held-out entries
    are binarised at the threshold (2 if above, else 1) and scored with
    ``roc_auc_score``; the mean score per combination is written to
    ``results_file`` as csv.

    :param k: number of cv masks for each parameter combination
    :param parameters: candidate ranks, used for both object types
    :param parameters_t: candidate thresholds for binarising ratings
    :param mask: mask for primary test and train set
    :param R12: matrix for dfmf
    :param results_file: file for saving cv scores
    :returns: best_p_t1, best_p_t2, best_t (best parameters; on ties the
        combination evaluated last wins)
    """
    from itertools import product

    if path.exists(results_file):
        return self.load_results(results_file)

    cv_masks = self.get_cv_masks(self.users_ratings, mask, k)
    true_values = np.asarray(R12)

    best_cv_score = 0
    best_p_t1 = 0
    best_p_t2 = 0
    best_t = 0

    all_p_t1 = []
    all_p_t2 = []
    all_t = []
    all_scores = []

    # ``product`` iterates in the same order as the former nested loops.
    for p_t1, p_t2, t in product(parameters, parameters, parameters_t):
        scores = []
        for current_cv_mask in cv_masks:
            t1 = fusion.ObjectType('Type 1', p_t1)
            t2 = fusion.ObjectType('Type 2', p_t2)
            relations = [fusion.Relation(R12, t1, t2, name='Ratings')]

            fusion_graph = fusion.FusionGraph()
            fusion_graph.add_relations_from(relations)
            fuser = fusion.Dfmf(init_type="random_vcol")
            fusion_graph['Ratings'].mask = current_cv_mask
            dfmf_mod = fuser.fuse(fusion_graph)
            predictions = np.asarray(
                dfmf_mod.complete(fusion_graph['Ratings']))

            # Boolean indexing selects the held-out cells in row-major order,
            # exactly as the former element-wise loops did.
            held_out = np.asarray(current_cv_mask, dtype=bool)
            ratings_true = np.where(true_values[held_out] > t, 2, 1)
            ratings_predicted = np.where(predictions[held_out] > t, 2, 1)

            scores.append(roc_auc_score(ratings_true, ratings_predicted))

        score = sum(scores) / len(scores)

        all_p_t1.append(p_t1)
        all_p_t2.append(p_t2)
        all_t.append(t)
        all_scores.append(score)

        # Save best scores to a variable
        if score >= best_cv_score:
            best_cv_score = score
            best_p_t1 = p_t1
            best_p_t2 = p_t2
            best_t = t

    # Save cv scores to a csv file
    data = {
        'p_t1': all_p_t1,
        'p_t2': all_p_t2,
        't': all_t,
        'score': all_scores
    }
    df = pd.DataFrame(data, columns=['p_t1', 'p_t2', 't', 'score'])
    df.to_csv(results_file)

    return best_p_t1, best_p_t2, best_t
def factorization(self, cv_results_file, use_user_data=True):
    """
    Matrix factorization, saves predictions to self.predictions and mask
    to self.mask

    Zero ratings are treated as missing (NaN); when ``self.z_score`` is set
    the rating matrix is z-scored per column, ignoring NaNs. The ranks and
    the rating threshold come from ``self.cross_validation``; the threshold
    is stored in ``self.t``.

    :param cv_results_file: file for saving cv scores
    :param use_user_data: when false, the 'Users' relation is left out of
        the final fusion graph
    """
    print('\nDfmf')

    selected_features = self.selected_features
    mask = self.split_train_test(self.users_ratings, 0.2)

    R23 = selected_features
    R14 = self.users

    # Vectorised replacement for the former element-wise loop; ``np.nan`` is
    # used because the ``np.NaN`` alias was removed in NumPy 2.0.
    ratings = np.asarray(self.users_ratings, dtype=float)
    R12 = np.where(ratings == 0, np.nan, ratings)
    if self.z_score:
        R12 = zscore(R12, axis=0, nan_policy='omit')

    # Parameters choice. Wider grids were assigned here before and then
    # immediately overwritten; only the effective values are kept.
    parameters_k1 = [60, 70]
    parameters_k2 = [60, 70]
    parameters_k3 = [8, 10]
    parameters_k4 = [8, 10]
    t = [6, 7]
    k = 3

    (best_p_t1, best_p_t2, best_p_t3, best_p_t4,
     best_t) = self.cross_validation(
        k, parameters_k1, parameters_k2, parameters_k3, parameters_k4, t,
        mask, R12, R23, R14, cv_results_file)
    print(
        str(best_p_t1) + ' ' + str(best_p_t2) + ' ' + str(best_p_t3) + ' ' +
        str(best_p_t4) + ' ' + str(best_t) + '\n')

    # Save best threshold for positive and negative class
    self.t = best_t

    # Predictions
    t1 = fusion.ObjectType('Type 1', best_p_t1)
    t2 = fusion.ObjectType('Type 2', best_p_t2)
    t3 = fusion.ObjectType('Type 3', best_p_t3)
    t4 = fusion.ObjectType('UserData', best_p_t4)

    relations = [
        fusion.Relation(R12, t1, t2, name='Ratings'),
        fusion.Relation(R23, t2, t3, name='Images'),
    ]
    if use_user_data:
        relations.append(fusion.Relation(R14, t1, t4, name='Users'))

    fusion_graph = fusion.FusionGraph()
    fusion_graph.add_relations_from(relations)
    fuser = fusion.Dfmf(init_type="random_vcol")
    fusion_graph['Ratings'].mask = mask
    dfmf_mod = fuser.fuse(fusion_graph)

    self.predictions = dfmf_mod.complete(fusion_graph['Ratings'])
    self.mask = mask
    self.true_values = R12
def cross_validation(self, k, parameters_k1, parameters_k2, parameters_k3,
                     parameters_k4, parameters_t, mask, R12, R23, R14,
                     results_file):
    """
    Grid search over the four type ranks and the rating threshold.

    If ``results_file`` already exists the ranks are returned through
    ``self.load_results`` together with a fixed threshold of 7. Otherwise,
    for every combination a Dfmf model is fitted once per cv mask, held-out
    entries are binarised at the threshold (2 if above, else 1) and scored
    with ``roc_auc_score`` and ``rmse``; mean scores per combination are
    written to ``results_file`` as csv. Selection uses the ROC AUC only.

    :param k: number of cv masks for each parameter combination
    :param parameters_k1: candidate ranks for 'Type 1'
    :param parameters_k2: candidate ranks for 'Type 2'
    :param parameters_k3: candidate ranks for 'Type 3'
    :param parameters_k4: candidate ranks for 'UserData'
    :param parameters_t: candidate thresholds for binarising ratings
    :param mask: mask for primary test and train set
    :param R12: ignored; the rating matrix is rebuilt from self.users_ratings
    :param R23: matrix for dfmf
    :param R14: matrix for dfmf
    :param results_file: file for saving cv scores
    :returns: best_p_t1, best_p_t2, best_p_t3, best_p_t4, best_t (on ties
        the combination evaluated last wins)
    """
    from itertools import product

    print('\nCross validation\n')
    if path.exists(results_file):
        p1, p2, p3, p4 = self.load_results(results_file)
        # NOTE(review): the threshold is not read back from the cache; 7 is
        # hard-coded here - confirm this is intended.
        return p1, p2, p3, p4, 7

    cv_masks = self.get_cv_masks(self.users_ratings, mask, k)

    # Zero ratings are treated as missing. Vectorised replacement for the
    # former element-wise loop; ``np.NaN`` was removed in NumPy 2.0.
    raw = np.asarray(self.users_ratings, dtype=float)
    true_values = np.where(raw == 0, np.nan, raw)
    if self.z_score:
        # NOTE(review): no nan_policy is given, so NaNs propagate through
        # zscore here - confirm against the caller's preprocessing.
        true_values = zscore(true_values, axis=0)

    best_cv_score = 0
    best_p_t1 = 0
    best_p_t2 = 0
    best_p_t3 = 0
    best_p_t4 = 0
    best_t = 0

    all_p_t1 = []
    all_p_t2 = []
    all_p_t3 = []
    all_p_t4 = []
    all_t = []
    all_scores = []
    all_scores_rmse = []

    # ``product`` iterates in the same order as the former nested loops.
    grid = product(parameters_k1, parameters_k2, parameters_k3,
                   parameters_k4, parameters_t)
    for p_t1, p_t2, p_t3, p_t4, t in grid:
        scores = []
        scores_rmse = []
        for current_cv_mask in cv_masks:
            t1 = fusion.ObjectType('Type 1', p_t1)
            t2 = fusion.ObjectType('Type 2', p_t2)
            t3 = fusion.ObjectType('Type 3', p_t3)
            t4 = fusion.ObjectType('UserData', p_t4)
            relations = [
                fusion.Relation(true_values, t1, t2, name='Ratings'),
                fusion.Relation(R23, t2, t3, name='Images'),
                fusion.Relation(R14, t1, t4, name='Users')
            ]
            fusion_graph = fusion.FusionGraph()
            fusion_graph.add_relations_from(relations)
            fuser = fusion.Dfmf(init_type="random_vcol")
            fusion_graph['Ratings'].mask = current_cv_mask
            dfmf_mod = fuser.fuse(fusion_graph)
            predictions = np.asarray(
                dfmf_mod.complete(fusion_graph['Ratings']))

            if self.z_score:
                # Rescale with the per-column mean/std of the predictions
                # themselves, with exact zeros treated as missing.
                completed = predictions.astype(float)
                a = np.where(completed == 0, np.nan, completed)
                mns = np.nanmean(a=a, axis=0, keepdims=True)
                sstd = np.nanstd(a=a, axis=0, keepdims=True)
                predictions = (a * sstd) + mns

            # Boolean indexing selects the held-out cells in row-major order,
            # exactly as the former element-wise loops did.
            held_out = np.asarray(current_cv_mask, dtype=bool)
            ratings_true = np.where(true_values[held_out] > t, 2, 1)
            ratings_predicted = np.where(predictions[held_out] > t, 2, 1)

            # Score (both metrics are computed on the binarised labels)
            scores.append(roc_auc_score(ratings_true, ratings_predicted))
            scores_rmse.append(rmse(ratings_true, ratings_predicted))

        score = sum(scores) / len(scores)
        score_rmse = sum(scores_rmse) / len(scores_rmse)

        all_p_t1.append(p_t1)
        all_p_t2.append(p_t2)
        all_p_t3.append(p_t3)
        all_p_t4.append(p_t4)
        all_t.append(t)
        all_scores.append(score)
        all_scores_rmse.append(score_rmse)

        # Save best scores to a variable
        if score >= best_cv_score:
            best_cv_score = score
            best_p_t1 = p_t1
            best_p_t2 = p_t2
            best_p_t3 = p_t3
            best_p_t4 = p_t4
            best_t = t

    # Save cv scores to a csv file
    data = {
        'p_t1': all_p_t1,
        'p_t2': all_p_t2,
        'p_t3': all_p_t3,
        'p_t4': all_p_t4,
        't': all_t,
        'score': all_scores,
        'rmse': all_scores_rmse
    }
    df = pd.DataFrame(
        data,
        columns=['p_t1', 'p_t2', 'p_t3', 'p_t4', 't', 'score', 'rmse'])
    df.to_csv(results_file)

    return best_p_t1, best_p_t2, best_p_t3, best_p_t4, best_t
for line in fin: try: yield line.split(delimiter, 1)[1] except IndexError: continue t1 = fusion.ObjectType('Type 1', nfactors) tdata = [ fusion.ObjectType(dataset, nfactors) for dataset in datasets ] relations = [] for i in range(len(datasets)): relations.append( fusion.Relation( np.transpose( np.loadtxt( strip_first_col(os.path.join(source_folder, datasets[i])), delimiter=sep, skiprows=1) ), t1, tdata[i] ) ) fusion_graph = fusion.FusionGraph() fusion_graph.add_relations_from(relations) print(fusion_graph) fuser = fusion.Dfmf() fuser.fuse(fusion_graph) np.savetxt(os.path.join(output_folder,"signals.txt"), fuser.factor(t1), delimiter='\t') for i in range(len(datasets)): np.savetxt(os.path.join(output_folder, "proj%s" %datasets[i]), fuser.factor(tdata[i]), delimiter='\t')
def factorization(self, cv_results_file, use_user_data=True):
    """
    Matrix factorization, saves predictions to self.predictions and mask
    to self.mask

    The threshold ``self.t`` is the middle element of the sorted non-zero
    ratings. Zero ratings are then treated as missing (NaN); when
    ``self.z_score`` is set the matrix is z-scored per column. The ranks of
    the four object types come from ``self.cross_validation``.

    :param cv_results_file: file for saving cv scores
    :param use_user_data: when false, the 'Users' relation is left out of
        the final fusion graph
    """
    print('\nDfmf')

    selected_features = self.selected_features

    # Vectorised replacement for the former element-wise collection loop.
    raw = np.asarray(self.users_ratings)
    observed = np.sort(raw[raw != 0])
    self.t = observed[round(len(observed) / 2)]

    mask = self.split_train_test(self.users_ratings, 0.2)

    R23 = selected_features
    R14 = self.users

    # ``np.nan`` is used because the ``np.NaN`` alias was removed in
    # NumPy 2.0.
    ratings = raw.astype(float)
    R12 = np.where(ratings == 0, np.nan, ratings)
    if self.z_score:
        R12 = zscore(R12, axis=0)

    # Parameters choice
    print('\nParameters\n')
    parameters = [2, 4, 6, 8, 10, 12, 14, 16, 18]
    k = 3

    best_p_t1, best_p_t2, best_p_t3, best_p_t4 = self.cross_validation(
        k, parameters, mask, R12, R23, R14, cv_results_file)
    print(str(best_p_t1) + ' ' + str(best_p_t2) + ' ' + str(best_p_t3) +
          ' ' + str(best_p_t4) + '\n')

    # Predictions
    t1 = fusion.ObjectType('Type 1', best_p_t1)
    t2 = fusion.ObjectType('Type 2', best_p_t2)
    t3 = fusion.ObjectType('Type 3', best_p_t3)
    t4 = fusion.ObjectType('UserData', best_p_t4)

    relations = [
        fusion.Relation(R12, t1, t2, name='Ratings'),
        fusion.Relation(R23, t2, t3, name='Images'),
    ]
    if use_user_data:
        relations.append(fusion.Relation(R14, t1, t4, name='Users'))

    fusion_graph = fusion.FusionGraph()
    fusion_graph.add_relations_from(relations)
    fuser = fusion.Dfmf(init_type="random_vcol")
    fusion_graph['Ratings'].mask = mask
    dfmf_mod = fuser.fuse(fusion_graph)

    self.predictions = dfmf_mod.complete(fusion_graph['Ratings'])
    self.mask = mask
    self.true_values = R12
import numpy as np
from skfusion import fusion
from skfusion import datasets

# Random demo matrices: RXY relates object type X (rows) to type Y (columns).
R12 = np.random.rand(50, 100)
R13 = np.random.rand(50, 40)
R23 = np.random.rand(100, 40)

t1 = fusion.ObjectType('Type 1', 10)
t2 = fusion.ObjectType('Type 2', 20)
t3 = fusion.ObjectType('Type 3', 30)

relations = [
    fusion.Relation(matrix, row_type, col_type)
    for matrix, row_type, col_type in (
        (R12, t1, t2),
        (R13, t1, t3),
        (R23, t2, t3),
    )
]

fusion_graph = fusion.FusionGraph()
fusion_graph.add_relations_from(relations)

fuser = fusion.Dfmf()
fuser.fuse(fusion_graph)
print(fuser.factor(t1).shape)

# Ten new rows for t1, described against the same t2 and t3 object types.
new_R12 = np.random.rand(10, 100)
new_R13 = np.random.rand(10, 40)
new_relations = [
    fusion.Relation(matrix, t1, col_type)
    for matrix, col_type in ((new_R12, t2), (new_R13, t3))
]
new_graph = fusion.FusionGraph(new_relations)

transformer = fusion.DfmfTransform()
transformer.transform(t1, new_graph, fuser)
print(transformer.factor(t1).shape)