def __init__(self):
    self.dh = DatasetHandler()
    self._utility_mat = None
class TDABasedClassifier:
    def __init__(self, data_file_name=None, dataset_type=IRIS, algorithm_mode=DIRECT):
        self.data_file_name = data_file_name if data_file_name else "dataset/iris.csv"
        self.algorithm_mode = algorithm_mode
        self.dataset_handler = DatasetHandler(dataset_type, 4)
        self.classifier_evaluator = None
        self.filtrations = None
        self.simplex_tree = None
        self.complex = None
        self.memory = None

    def init_data(self):
        self.dataset_handler.load_dataset()

    def split_dataset(self, k=None, j=None):
        self.dataset_handler.split_dataset(k, fold_position=j)

    def unify_dataset(self):
        return self.dataset_handler.unify_dataset()

    def destroy(self):
        if self.filtrations:
            del self.filtrations
            self.filtrations = None
        if self.simplex_tree:
            del self.simplex_tree
            self.simplex_tree = None
        if self.complex:
            del self.complex
            self.complex = None
        self.dataset_handler.clean()

    def get_link(self, sigma):
        """Compute link(sigma), since gudhi's SimplexTree has no link method.

        :param sigma: a vertex or a list/tuple of vertices
        :return: the set of vertices in the link of sigma
        """
        if self.simplex_tree is None:
            return set()
        link = set()
        if not isinstance(sigma, (list, tuple)):
            sigma = [sigma]
        try:
            _star = self.simplex_tree.get_star(sigma)
            for simplex, _ in _star:  # _ is the filtration value; not needed here
                simplex = set(simplex).difference(sigma)
                link = link.union(simplex)
            del _star
        except BaseException as e:
            print("ERROR in get_link: {0}".format(e))
        print("link({0}) = {1}".format(sigma, link))
        return link

    def Psi(self, sigma):
        """Psi is the assignment function that maps a set of labels t in P(T)
        to each simplex sigma in K."""
        if sigma is None:
            return []
        if not isinstance(sigma, (list, tuple)):
            sigma_key = str([sigma])
        else:
            sigma_key = str(sigma)
        if sigma_key in self.dataset_handler.tags_training:
            t = self.dataset_handler.tags_training[sigma_key]
            # t != None may occur when ksimplex is in S,
            # or when the computation was completed before
            return t if type(t) in [list, tuple, dict, np.ndarray] else [t]
        card = self.Card(sigma)
        # here we need to compute the associations
        self.dataset_handler.tags_training.update({sigma_key: []})
        result = []
        if card == 1:  # then ksimplex is in X and t = None
            link = self.get_link(sigma)
            for tau in link:
                result.extend(self.Psi(tau))
        else:
            for tau in sigma:
                result.extend(self.Psi(tau))
        self.dataset_handler.tags_training.update({sigma_key: result})
        return result

    def Card(self, sigma):
        return len(sigma) if isinstance(sigma, (list, tuple)) else 1

    def Gamma(self, sigma):
        """Gamma returns a vector V where each element v_i in V is the number
        of occurrences (votes) obtained by label t_i in T while computing
        Psi(sigma)."""
        card = self.Card(sigma)
        size_tags = len(self.dataset_handler.tags_set)
        V = [0] * size_tags
        if card == 1:
            _tags = self.Psi(sigma)
            for t in _tags:  # Psi(sigma) returns a set, so expand it
                _idx = self.G2(t)
                if _idx > -1:
                    V[_idx] += 1
        elif card > 1:
            for tau in sigma:
                V = list(map(sum, zip(V, self.Gamma(tau))))
        return V

    def Upsilon(self, sigma):
        """Upsilon assigns to sigma the label with the highest number of votes."""
        V = self.Gamma(sigma)
        i = self.M(V)
        return self.G(i)

    def G(self, idx):
        """G is a function that, given an integer i, returns the label at
        position i, assuming some lexicographic order on T."""
        if idx is None or idx >= len(self.dataset_handler.tags_set) or idx < 0:
            return None
        # Naive code would iterate with enumerate until idx is reached;
        # converting the set to a list lets us index it directly.
        return list(self.dataset_handler.tags_set)[idx]

    def G2(self, tag):
        if tag not in self.dataset_handler.tags_position:
            return -1
        return self.dataset_handler.tags_position[tag]

    def M(self, vector):
        """M is a function that, given a vector V in R^{|T|}, returns an
        integer 0 <= i <= |T|, where i is the position of the component of V
        with the maximum value."""
        size = len(vector)
        if size < 1:
            return 0
        major = vector[0]
        pos = 0
        for idx, element in enumerate(vector):
            if major < element:
                pos = idx
                major = element
        return pos

    def I(self, condition):
        """I is a function that returns 1 if the given condition holds and 0 otherwise."""
        return 1 if condition else 0

    def build_filtered_simplicial_complex(self):
        S = self.unify_dataset()
        # self.complex = gudhi.AlphaComplex(points=S)
        self.complex = gudhi.RipsComplex(points=S, max_edge_length=8.0)
        self.simplex_tree = self.complex.create_simplex_tree(max_dimension=3)
        # self.simplex_tree = self.complex.create_simplex_tree(max_alpha_square=2)
        # self.simplex_tree.initialize_filtration()
        # diag = self.simplex_tree.persistence()
        # return diag

    def get_desired_persistence_interval2(self, choice=MAXIMAL):
        dimension = self.simplex_tree.dimension()
        print("\nDIMENSION := {0}\n".format(dimension))
        dimension -= 1
        pintervals = []
        while len(pintervals) == 0 and dimension > -1:
            pintervals = self.simplex_tree.persistence_intervals_in_dimension(dimension)
            dimension -= 1
        if len(pintervals) == 0:
            return None
        intervals_count = len(pintervals)
        if choice == MAXIMAL:
            # get the persistence interval with maximal lifetime
            major = pintervals[0][1] - pintervals[0][0]
            desired_pos = 0
            for idx, interv in enumerate(pintervals):
                i = interv[1] - interv[0]
                if major < i and not math.isinf(i):
                    major = i
                    desired_pos = idx
            print("the maximal lifetime is", major)
        elif choice == RANDOMIZED:
            # get a randomized persistence interval; sample from the upper
            # half of the intervals to maximize possibilities
            desired_pos = random.randint(int(intervals_count / 2),
                                         intervals_count - 1)
            print("\nThe randomly selected lifetime is {0}\n".format(
                pintervals[desired_pos][1] - pintervals[desired_pos][0]))
        else:
            # get the persistence interval closest to the average lifetime
            Avg = 0
            for interv in pintervals:
                Avg += interv[1] - interv[0]
            Avg /= intervals_count
            desired_pos = 0
            min_d = math.fabs((pintervals[0][1] - pintervals[0][0]) - Avg)
            for idx, interv in enumerate(pintervals):
                i = math.fabs((interv[1] - interv[0]) - Avg)
                if min_d > i and not math.isinf(i):
                    min_d = i
                    desired_pos = idx
        print("the chosen persistence interval is", pintervals[desired_pos])
        inter = pintervals[desired_pos]
        del pintervals
        return inter

    def get_desired_persistence_interval(self, choice=MAXIMAL):
        dimension = self.simplex_tree.dimension()
        print("\nDIMENSION := {0}\n".format(dimension))
        dimension -= 1
        pintervals = []
        while len(pintervals) == 0 and dimension > -1:
            pintervals = self.simplex_tree.persistence_intervals_in_dimension(dimension)
            dimension -= 1
        if len(pintervals) == 0:
            return None
        # compute the persistence interval with maximal lifetime
        major = pintervals[0][1] - pintervals[0][0]
        desired_pos = 0
        for idx, interv in enumerate(pintervals):
            i = interv[1] - interv[0]
            if major < i and not math.isinf(i):
                major = i
                desired_pos = idx
        print("the maximal lifetime is", major)
        if choice == MAXIMAL:
            return pintervals[desired_pos]
        # We look for all persistence intervals whose birth is greater than or
        # equal to the birth of the maximal persistence interval
        high_lifetimes_pi = []
        max_pi = pintervals[desired_pos]
        lifetime = max_pi[1] - max_pi[0]
        for idx, interv in enumerate(pintervals):
            if interv[0] >= max_pi[0] and lifetime < (interv[1] - interv[0]) * 1.5:
                high_lifetimes_pi.append(interv)
        intervals_count = len(pintervals)
        init = 0
        if len(high_lifetimes_pi) == 1:
            high_lifetimes_pi = pintervals
            init = int(intervals_count / 2)
        intervals_count = len(high_lifetimes_pi)
        if choice == RANDOMIZED:
            # get a randomized persistence interval; sample from the upper
            # half of the candidates to maximize possibilities
            desired_pos = random.randint(init, intervals_count - 1)
            print("\nThe randomly selected lifetime is {0}\n".format(
                high_lifetimes_pi[desired_pos][1] - high_lifetimes_pi[desired_pos][0]))
            return high_lifetimes_pi[desired_pos]
        else:
            # get the persistence interval closest to the average lifetime
            Avg = 0
            for interv in high_lifetimes_pi:
                Avg += interv[1] - interv[0]
            if intervals_count > 0:
                Avg /= intervals_count
            else:
                return None
            desired_pos = 0
            # start from the first candidate interval's distance to the average
            min_d = math.fabs((high_lifetimes_pi[0][1] - high_lifetimes_pi[0][0]) - Avg)
            for idx, interv in enumerate(high_lifetimes_pi):
                i = math.fabs((interv[1] - interv[0]) - Avg)
                if min_d > i and not math.isinf(i):
                    min_d = i
                    desired_pos = idx
            print("the chosen persistence interval is", high_lifetimes_pi[desired_pos])
            return high_lifetimes_pi[desired_pos]

    def execute(self):
        self.init_data()
        persistence_selector = {
            RANDOMIZED: "RANDOMIZED",
            MAXIMAL: "MAXIMAL",
            AVERAGE: "AVERAGE"
        }
        all_data = []
        size_data = len(self.dataset_handler.dataset)
        for selector in persistence_selector:
            self.classifier_evaluator = ClassifierEvaluator(
                "TDABC_{0}".format(persistence_selector[selector]),
                classes=self.dataset_handler.tags_set)
            self.knn_classifier_evaluator = ClassifierEvaluator(
                "kNN_{0}".format(persistence_selector[selector]),
                classes=self.dataset_handler.tags_set)
            for k in [5, 10, 15, 20, 25]:
                print("\n#####################################")
                print("\nEXECUTING REPEATED CROSS VALIDATION")
                folds = int((size_data + k - 1) / k)
                for j in range(folds):
K-FOLD k={0}, n={1}".format(k, j)) self.split_dataset(k, j) diag = self.build_filtered_simplicial_complex( ) # to compute simplicial complex and filtrations print("persistence diagrams: ", diag) persistence_interval = self.get_desired_persistence_interval( choice=selector) if persistence_interval is None: # we ignore the process self.destroy() print( "we destroy all simlicial complex information because we couldnt find any persistent interval" ) continue self.simplex_tree.prune_above_filtration( persistence_interval[0]) predicted_values = [] real_values = [] elems = [] ttags = [ self.dataset_handler.tags_position[ self.dataset_handler.tags_training[i]] for i in self.dataset_handler.tags_training ] ttraining = [e for e in self.dataset_handler.training] for idx, x0 in self.dataset_handler.test: idx_key = str([idx]) value = self.Upsilon(idx) elems.append(x0) predicted_values.append(value) real_values.append( self.dataset_handler.tags_test[idx_key]) acc = accuracy_score(real_values, predicted_values) * 100 self.classifier_evaluator.add_metrics( real_values, predicted_values) self.destroy() knn = kNNClassifier(ttraining, ttags) in_values = knn.execute(elems) all_data.extend(elems) acc_knn = "None" if len(in_values) > 0: predicted_values2 = [self.G(i) for i in in_values] self.knn_classifier_evaluator.add_metrics( real_values, predicted_values2) acc_knn = accuracy_score(real_values, predicted_values2) * 100 print("\nTDABC accuracy = {0}".format(acc)) print("\nKNN accuracy = {0}".format(acc_knn)) self.classifier_evaluator.plot_all() self.knn_classifier_evaluator.plot_all() plt.show() def draw_simplex_tree(self): path = "./docs/SIMPLEX_TREES" file_name = time.strftime( "./docs/SIMPLEX_TREES/simplex_tree_%y.%m.%d__%H.%M.%S.txt") if not os.path.exists(path): os.makedirs(path) simplex_tree_file = open(file_name, "w") filtrations = self.simplex_tree.get_filtration() fmt = "%s:(%s):%.2f" points = self.unify_dataset() for filtered_value in filtrations: qsimplex = str(filtered_value[0]) filt = filtered_value[1] point = "" inner_simplex = qsimplex[1:-1] if inner_simplex.find(",") == -1: point = points[int(inner_simplex)] line = fmt % tuple((qsimplex, point, filt)) print(line) simplex_tree_file.write(str(line) + "\n")
class DatasetPlotter:
    def __init__(self, data_mgr=None):
        self.data = data_mgr
        if data_mgr is None:
            self.data = DatasetHandler(IRIS)
            self.data.load_dataset()

    def draw_data(self):
        if self.data.is_dataset(IRIS):
            self.draw_iris()
        elif self.data.is_dataset(SWISSROLL):
            self.draw_swiss_roll()
        else:
            self.draw_iris()

    def draw_iris(self):
        data_A_sample = self.data.unify_dataset()
        fig = plt.figure()
        fig.set_size_inches(10, 8)
        ax = fig.add_subplot(111)
        ks = list(self.data.tags_set)
        points = {ks[0]: [[], []], ks[1]: [[], []], ks[2]: [[], []]}
        for i in self.data.tags_training:
            idx = int(i[1:-1])
            k = self.data.tags_training[i]
            points[k][0].append(data_A_sample[idx][0])
            points[k][1].append(data_A_sample[idx][1])
        for i in self.data.tags_test:
            idx = int(i[1:-1])
            k = self.data.tags_test[i]
            points[k][0].append(data_A_sample[idx][0])
            points[k][1].append(data_A_sample[idx][1])
        area = 15 ** 2
        for idx, c in enumerate(['r', 'b', 'g']):
            values = points[ks[idx]]
            l = self.data.labels[ks[idx]].strip()
            if l.find("setosa") != -1:
                l = "Setosa"
            elif l.find("versicolor") != -1:
                l = "Versicolor"
            elif l.find("virginica") != -1:
                l = "Virginica"
            ax.scatter(values[0], values[1], s=area, c=c, marker="o", label=l)
        ax.set_xlabel('Sepal length', size=15)
        ax.set_ylabel('Sepal width', size=15)
        ax.legend(fontsize=20)
        plt.savefig('DATA_GRAPHICS/iris.png')

    def draw_swiss_roll(self):
        fig = plt.figure()
        fig.set_size_inches(10, 8)
        ax = p3.Axes3D(fig)
        ax.view_init(7, -80)
        label = self.data.tags
        X = self.data.dataset
        for l in np.unique(label):
            ax.scatter(X[label == l, 0], X[label == l, 1], X[label == l, 2],
                       color=plt.cm.jet(float(l) / np.max(label + 1)),
                       s=20, edgecolor='k')
        plt.title('Swiss Roll')
        plt.savefig('DATA_GRAPHICS/swissroll.png')
        plt.show()

    def draw_hyperplanes(self, classifiers, names, scores):
        h = .02  # step size in the mesh
        # names = ["Nearest Neighbors", "TDA-Based Classifier (TDABC)"]
        figure = plt.figure(figsize=(27, 9))
        i = 1
        # iterate over datasets
        X = np.array(self.data.dataset)
        X_train = np.array(self.data.training)
        X_test = np.array(self.data.test)
        y_train = [
            self.data.tags_position[self.data.tags_training[i]]
            for i in self.data.tags_training
        ]
        y_test = None
        if len(self.data.tags_test) > 0:
            y_test = [
                self.data.tags_position[self.data.tags_test[i]]
                for i in self.data.tags_test
            ]
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        # just plot the dataset first
        cm = plt.cm.RdBu
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
        ax = plt.subplot(1, len(classifiers) + 1, i)
        ax.set_title("Input data")
        # Plot the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
        # Plot the testing points
        if y_test is not None:
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                       alpha=0.6, edgecolors='k')
        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
        i += 1
        # iterate over classifiers
        for name, clf, score in zip(names, classifiers, scores):
            ax = plt.subplot(1, len(classifiers) + 1, i)
            # Plot the decision boundary. For that, we will assign a color to each
            # point in the mesh [x_min, x_max]x[y_min, y_max].
            Z = np.array(clf)
            # Put the result into a color plot
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
            # Plot the training points
            ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright, edgecolors='k')
            # Plot the testing points
            ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,
                       edgecolors='k', alpha=0.6)
            ax.set_xlim(xx.min(), xx.max())
            ax.set_ylim(yy.min(), yy.max())
            ax.set_xticks(())
            ax.set_yticks(())
            ax.set_title(name)
            if score is not None:
                ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                        size=15, horizontalalignment='right')
            i += 1  # move to the next subplot for the next classifier
        plt.tight_layout()
        plt.show()
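# ---------------------------------------------------------------------------
# A minimal sketch of the meshgrid decision-boundary pattern used by
# draw_hyperplanes. One assumed difference: draw_hyperplanes receives
# precomputed per-mesh-point predictions (Z = np.array(clf)), while this
# sketch calls clf.predict on the mesh directly; the toy data is illustrative.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

def _decision_boundary_demo():
    X = np.array([[0, 0], [1, 0], [0, 1], [2, 2], [3, 2], [2, 3]], dtype=float)
    y = np.array([0, 0, 0, 1, 1, 1])
    clf = KNeighborsClassifier(n_neighbors=3).fit(X, y)

    h = .02  # same mesh step size as draw_hyperplanes
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # one prediction per mesh point, reshaped back to the grid
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
    plt.contourf(xx, yy, Z, cmap=plt.cm.RdBu, alpha=.8)
    plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')
    plt.show()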
                          n_file_item=items_per_file)
        res_storer.upload_and_store(
            base_dir, triplestore_url, base_iri, context_path,
            temp_dir_for_rdf_loading)

        prov_storer = Storer(prov,
                             context_map={context_path: context_file_path},
                             dir_split=dir_split_number,
                             n_file_item=items_per_file)
        prov_storer.store_all(
            base_dir, base_iri, context_path,
            temp_dir_for_rdf_loading)

        dset_handler = DatasetHandler(triplestore_url_real, context_path,
                                      context_file_path, base_iri, base_dir,
                                      info_dir, dataset_home,
                                      temp_dir_for_rdf_loading)
        dset_handler.update_dataset_info(result)

        # If everything went fine, move the input file to the done directory
        move_file(cur_file_path, reference_dir_done + os.sep + cur_local_dir_path)
    # If something in the process went wrong, move the input file
    # to an appropriate directory
    else:
        if crp.reperr.is_empty():
            # The resource has already been processed
            move_file(cur_file_path, reference_dir_done + os.sep + cur_local_dir_path)
        else:
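# ---------------------------------------------------------------------------
# A minimal sketch of the done/error routing pattern applied above, using only
# the standard library; route_file, done_dir and error_dir are hypothetical
# names, and shutil.move stands in for the project's move_file helper.
import os
import shutil

def route_file(cur_file_path, succeeded, done_dir, error_dir):
    target_dir = done_dir if succeeded else error_dir
    os.makedirs(target_dir, exist_ok=True)
    shutil.move(cur_file_path,
                os.path.join(target_dir, os.path.basename(cur_file_path)))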
def dataset_generator():
    rospy.init_node('dataset_generator_node')
    moveit_handler = MoveItHandler()
    ring_handler = RingHandler()
    rospy.Subscriber("/vrep_ros_interface/image", Image, image_callback)
    rospy.Subscriber("/ring_current_position", Pose, ring_handler.update_ring_pose)
    pub_ring = rospy.Publisher(conf.get('Ring', 'PositionTopic'), Pose, queue_size=1)
    pub_joint_controller = rospy.Publisher('/dagger/joint_states', JointState, queue_size=1)
    pub_delta_controller = rospy.Publisher('/dagger/delta_pose', PoseStamped, queue_size=1)
    rospy.sleep(3)
    init_ring = True
    init_panda = True
    for i in range(50):
        print("Iteration: ", i)
        dataset_handler = DatasetHandler(i)
        while not rospy.is_shutdown():
            if init_panda:
                print("moving to ready position")
                pub_joint_controller.publish(moveit_handler.target_joint_states)
                init_panda = False
                moveit_handler.wait(moveit_handler.target_joint_states)
                continue
            if init_ring:
                print("setting ring to random pose")
                ring_handler.set_random_valid_pose()
                ring_pose = ring_handler.get_ring_pose()
                pub_ring.publish(ring_pose)
                # offset of 0.5 on x between V-REP and RViz!
                ring_handler.ring_coordinate.x += 0.5
                init_ring = False
                rospy.sleep(1)
                continue
            if moveit_handler.get_step_size(ring_handler.ring_coordinate) < conf.getfloat('Goal', 'MinStep'):
                rospy.sleep(3)
                init_ring = True
                init_panda = True
                dataset_handler.save()
                break
            moveit_handler.compute_master_policy(ring_handler)
            dataset_handler.append((LAST_IMAGE, [
                moveit_handler.delta_pose.pose.position.x,
                moveit_handler.delta_pose.pose.position.y,
                moveit_handler.delta_pose.pose.position.z
            ]))
            pub_delta_controller.publish(moveit_handler.delta_pose)
            moveit_handler.update_target_pose()
            moveit_handler.wait(moveit_handler.target_pose)
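# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the rospy publish pattern the loop above
# relies on; the message contents are illustrative assumptions, and the topic
# name is taken from the snippet above.
import rospy
from geometry_msgs.msg import PoseStamped

def publish_delta_once():
    rospy.init_node('delta_publisher_example')
    pub = rospy.Publisher('/dagger/delta_pose', PoseStamped, queue_size=1)
    rospy.sleep(1)  # give subscribers time to connect before publishing
    msg = PoseStamped()
    msg.header.stamp = rospy.Time.now()
    msg.pose.position.x = 0.1  # a small delta toward the target (assumed value)
    pub.publish(msg)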
print("Prediction and error calculation of SVD with 90% energy took " + str(time.time() - next_part) + " secs") print("\n\nOverall process: " + str(time.time() - start) + " secs") def error(self, A, test_ratings): """ Computes the error of the input ratings vs predicted values from model. Args: ratings (np.ndarray): An array of <user_id, item_id, true_rating> tuples Returns: The Root Mean Square Error and Mean Absolute Error values. """ sq_err, abs_err = 0, 0 for user_id, item_id, rating in test_ratings: predicted = A[user_id - 1][item_id - 1] diff = predicted - rating abs_err += abs(diff) sq_err += diff * diff rmse = np.sqrt(sq_err / len(test_ratings)) mae = abs_err / len(test_ratings) return rmse, mae if __name__ == "__main__": s = SVD() dh = DatasetHandler() s.predict_and_find_error(dh.test_ratings.values)
        for user_id, item_id, rating in test:
            predicted = A[user_id - 1][item_id - 1]
            diff = predicted - rating
            abs_err += abs(diff)
            sq_err += diff * diff
        rmse = np.sqrt(sq_err / len(test))
        mae = abs_err / len(test)
        return rmse, mae


if __name__ == "__main__":
    t = CUR()
    path1 = "data/test_ratings.csv"
    M = UtilityMatrix().utility_mat.values
    Test_Set = DatasetHandler().test_ratings.values
    # Fill the missing entries of each row with the row's mean rating
    for i in range(M.shape[0]):
        row_sum = 0
        count = 0
        for j in range(M.shape[1]):
            if not math.isnan(M[i][j]):
                row_sum += M[i][j]
                count += 1
        for j in range(M.shape[1]):
            if math.isnan(M[i][j]):
                M[i][j] = row_sum / count
    C, U, R = t.mycur(M, 1000)
    A = np.dot(C, np.dot(U, R))
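# ---------------------------------------------------------------------------
# The row-mean imputation above, condensed with np.nanmean; a sketch assuming
# every row has at least one observed rating (np.nanmean of an all-NaN row is
# itself NaN and would propagate).
import numpy as np

def impute_row_means(M):
    M = M.copy()
    row_means = np.nanmean(M, axis=1)           # per-user mean over observed entries
    nan_rows, nan_cols = np.where(np.isnan(M))  # positions of missing ratings
    M[nan_rows, nan_cols] = row_means[nan_rows]
    return M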