def user_removal_based_on_participation():
    """Select the set of users worth keeping based on posting activity.

    A user is kept when their number of posts OR their average
    characters-per-post exceeds the respective median over all users.
    The resulting set is cached at ``num_files/keep_users.pkl`` and
    reused on subsequent calls.

    Returns:
        set[str]: usernames to keep.
    """
    keep_users_p = 'num_files/keep_users.pkl'
    if os.path.exists(keep_users_p):
        # Cached result from a previous run.
        return unpickle_object(keep_users_p)
    # CSV rows: [user, num_posts, len_post_1, len_post_2, ...]; skip header.
    rows = read_csv_list('num_files/user_to_num.csv')[1:]
    users = [row[0] for row in rows]
    # Number of posts per user
    num_posts = np.array([int(row[1]) for row in rows])
    # Characters per user post
    post_lengths = [np.array([int(c) for c in row[2:]]) for row in rows]
    # Average characters per post, 0.0 for users with no recorded posts.
    # BUG FIX: the original dropped empty users from this array but then
    # zipped it against the full `users` list, silently misaligning the
    # (user, avg) pairing for every user after the first empty one.
    avg_chars = np.array([p.mean() if len(p) > 0 else 0.0
                          for p in post_lengths])
    # Median thresholds; the avg-chars median is computed only over users
    # that actually posted, matching the original intent.
    nonempty_avgs = np.array([p.mean() for p in post_lengths if len(p) > 0])
    limi = np.quantile(num_posts, .50)
    limk = np.quantile(nonempty_avgs, .50)
    keep_users = set()
    for user, i, k in zip(users, num_posts, avg_chars):
        if i > limi or k > limk:
            keep_users.add(user)
    pickle_object(keep_users, keep_users_p)
    return keep_users
def join_all_results(self):
    """Join the per-id result lists into one dict keyed by (user1, user2).

    Always records the number of results per id in ``self.total`` (a side
    effect callers may rely on), even when the joined dict is restored
    from the backup pickle.

    Returns:
        dict: (user1, user2) -> {id: value} mapping (filled per id by
        ``fill_results_for_id``).
    """
    join_dict_p = "join_dict.pkl"
    # FIX: the original always built the key skeleton and then threw it
    # away when a backup existed; skip that work when we will load it.
    use_backup = self.backup and os.path.exists(join_dict_p)
    join_dict = dict()
    for _id in self.ids:
        filename = _id + "_files/results.pkl"
        results = unpickle_object(filename)
        self.total[_id] = len(results)
        self.pprint("[%s] Total results: %d" % (_id, self.total[_id]))
        if not use_backup:
            # Pre-create one entry per (user1, user2) pair.
            for res in results:
                u1_u2_tuple = res[:2]
                if u1_u2_tuple not in join_dict:
                    join_dict[u1_u2_tuple] = dict()
    if use_backup:
        return unpickle_object(join_dict_p)
    for _id in self.ids:
        join_dict = self.fill_results_for_id(_id, join_dict)
    pickle_object(join_dict, join_dict_p)
    return join_dict
def get_information_from_matrix(self, user_ind, sparse_matrix_dot):
    """Extract, for every user, the other user(s) with the strongest relation.

    For each row of the (square) coincidence matrix the maximum
    off-diagonal value is found; every column attaining that maximum
    yields a ``(user, other_user, value)`` tuple. Results are sorted by
    value descending, pickled, and dumped to CSV under ``self.dir``.

    Args:
        user_ind: dict mapping user name -> row/column index.
        sparse_matrix_dot: square scipy sparse matrix of relation values.

    Returns:
        list[tuple]: (user1, user2, relation_value), strongest first.
    """
    tic = time.time()
    inv_user_ind = {v: k for k, v in user_ind.items()}  # index -> user name
    lst_res = []  # FIX: was initialized twice in the original
    tx = sparse_matrix_dot.shape[0]
    print(sparse_matrix_dot.shape)
    uind = 0  # FIX: keeps the final log defined when the matrix is empty
    for uind in range(tx):
        if uind % 100 == 0:
            self.pprint("Info Extraction", "[%d Users Processed]" % (uind),
                        "[%d List Length]" % (len(lst_res)),
                        "[%0.3f Percentage]" % ((uind / tx) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        # Densify a single row (drops the redundant np.array() copy).
        row = sparse_matrix_dot[uind].toarray().flatten()
        row[uind] = 0  # We do not consider the comparison with itself
        rmax = row.max()
        if rmax > 0:
            n = (row == rmax).sum()
            # argpartition only orders the top "-n" elements of the row,
            # then we extract those last n -- cheaper than a full argsort.
            max_uinds = np.argpartition(row, -n)[-n:]
            for i in max_uinds:
                lst_res.append((inv_user_ind[uind], inv_user_ind[i], rmax))
    lst_res = sorted(lst_res, key=lambda x: x[2], reverse=True)
    pickle_object(lst_res, self.dir + "results.pkl")
    pct = (uind / tx) * 100 if tx else 100.0  # FIX: avoid /0 on empty input
    self.pprint("[END] Info Extraction", "[%d Users Processed]" % (uind),
                "[%d List Length]" % (len(lst_res)),
                "[%0.3f Percentage]" % (pct),
                get_ram(), get_elapsed_time(tic))
    gen_csv_from_tuples(self.dir + "results.csv",
                        ["User1", "User2", "Relation Value"], lst_res)
    return lst_res
def compute_matrix_mult(self, sparse_matrix):
    """Compute the user-by-user coincidence matrix ``M @ M.T``.

    Since the product is symmetric, only the upper triangle is kept
    (CSR format); explicit zeros are dropped and the result is pickled
    under ``self.dir`` before being returned.
    """
    tic = time.time()
    result_path = self.dir + 'sparse_matrix_dot_a.pkl'
    print(sparse_matrix.shape)
    self.pprint("Executing coincidence computation over matrix",
                get_ram(), get_elapsed_time(tic), end='\r')
    coincidence = sparse_matrix.dot(sparse_matrix.T)
    # Symmetric product: the upper triangle carries all the information.
    coincidence = sparse.triu(coincidence, format='csr')
    print(coincidence.shape)
    coincidence.eliminate_zeros()
    pickle_object(coincidence, result_path)
    self.pprint("[END] Executing coincidence computation over matrix",
                get_ram(), get_elapsed_time(tic))
    print(coincidence.shape)
    return coincidence
def gen_sparse_matrix(self, dictio_of_users, dictio_of_usage, num_values):
    """Build the (users x values) CSR matrix from the user->values mapping.

    Args:
        dictio_of_users: dict mapping user index -> iterable of value
            indices (keys are assumed to already be integer row indices
            -- TODO confirm against the caller).
        dictio_of_usage: dict mapping user index -> per-value usage counts,
            aligned positionally with ``dictio_of_users[uind]``.
        num_values: number of columns of the resulting matrix.

    Returns:
        scipy.sparse.csr_matrix of shape (num_users, num_values).
    """
    tic = time.time()
    sparse_matrix_p = self.dir + 'sparse_matrix.pkl'
    # Adding files to list for cleanup
    self.cleanup_list.append(sparse_matrix_p)
    if self.backup and os.path.exists(sparse_matrix_p):
        self.pprint("Sparse Matrix already exist, unpickling.", end='\r')
        sparse_matrix = unpickle_object(sparse_matrix_p)
        self.pprint("[END] Sparse Matrix already exist, unpickling.",
                    get_ram(), get_elapsed_time(tic))
        return sparse_matrix
    num_users = len(dictio_of_users)
    rows, cols, data = [], [], []
    ind = -1  # FIX: final log no longer raises NameError on an empty dict
    for ind, (uind, values) in enumerate(dictio_of_users.items()):
        if ind % 1000 == 0:
            self.pprint("Sparse Matrix Generation",
                        "[%d Users Processed]" % (ind),
                        "[%0.3f Percentage]" % ((ind / num_users) * 100),
                        get_ram(), get_elapsed_time(tic), end='\r')
        usages = dictio_of_usage[uind]
        for value, usage in zip(values, usages):
            rows.append(uind)
            cols.append(value)
            data.append(usage)
    sparse_matrix = csr_matrix((data, (rows, cols)),
                               shape=(num_users, num_values),
                               dtype=self.dtype)
    pct = (ind / num_users) * 100 if num_users else 100.0  # FIX: avoid /0
    self.pprint("[END] Sparse Matrix Generation",
                "[%d Users Processed]" % (ind),
                "[%0.3f Percentage]" % (pct),
                get_ram(), get_elapsed_time(tic))
    pickle_object(sparse_matrix, sparse_matrix_p)
    return sparse_matrix
def gen_matrix(self, join_dict):
    """Turn the joined results dict into a dense (pairs x features) matrix.

    Row i corresponds to the i-th (user1, user2) pair of ``join_dict``;
    column j corresponds to ``self.ids[j]``. Missing features stay 0.
    The matrix is pickled to ``matrix.pkl`` and reloaded from there when
    backups are enabled.
    """
    matrix_p = "matrix.pkl"
    self.pprint("Matrix generation", end='\r')
    if self.backup and os.path.exists(matrix_p):
        self.pprint("[END] Matrix generation, already existed")
        return unpickle_object(matrix_p)
    matrix = np.zeros((len(join_dict), len(self.ids)), dtype=np.uint32)
    for row, feature_dict in enumerate(join_dict.values()):
        for col, feature in enumerate(self.ids):
            value = feature_dict.get(feature)
            if value is not None:
                matrix[row][col] = value
    self.pprint("[END] Matrix generation")
    pickle_object(matrix, matrix_p)
    return matrix
def clean_matrix(self, sparse_matrix, user_ind, value_ind, dictio_of_users,
                 dictio_of_values):
    """Prune the sparse matrix and its index/lookup structures.

    Pipeline (order matters -- each stage feeds the next):
      1. Short-circuit: reload all five cleaned artifacts from backup
         pickles when they all exist.
      2. Keep only values shared by more than one user.
      3. Apply every user-removal hook in ``self.user_removal`` and every
         value-removal hook in ``self.value_removal`` (each hook returns
         the users/values to KEEP; sets are intersected).
      4. First ``remove_user_value_set`` pass applies stages 2-3.
      5. Drop users/values left with no entries, then a second
         ``remove_user_value_set`` pass.
      6. Persist the five cleaned artifacts.

    Returns:
        tuple: (sparse_matrix, user_ind, value_ind, dictio_of_users,
        dictio_of_values), all cleaned consistently.
    """
    tic = time.time()
    #inv_user_ind = {v: k for k, v in user_ind.items()}
    #inv_value_ind = {v: k for k, v in value_ind.items()}
    # Backup paths for the five cleaned artifacts.
    sparse_matrix_p = self.dir + 'clean_sparse_matrix.pkl'
    user_ind_p = self.dir + 'clean_user_ind.pkl'
    value_ind_p = self.dir + 'clean_value_ind.pkl'
    dictio_of_users_p = self.dir + 'clean_dictio_of_users.pkl'
    dictio_of_values_p = self.dir + 'clean_dictio_of_values.pkl'
    #Adding files to list for cleanup
    self.cleanup_list.append(sparse_matrix_p), self.cleanup_list.append(
        user_ind_p), self.cleanup_list.append(value_ind_p), self.cleanup_list.append(
        dictio_of_users_p), self.cleanup_list.append(dictio_of_values_p)
    # Stage 1: all five backups must exist to take the fast path.
    if self.backup and os.path.exists(sparse_matrix_p) and os.path.exists(
            user_ind_p) and os.path.exists(value_ind_p) and os.path.exists(
            dictio_of_users_p) and os.path.exists(dictio_of_values_p):
        self.pprint("Clean data already exist, unpickling.", end='\r')
        user_ind = unpickle_object(user_ind_p)
        value_ind = unpickle_object(value_ind_p)
        dictio_of_users = unpickle_object(dictio_of_users_p)
        dictio_of_values = unpickle_object(dictio_of_values_p)
        sparse_matrix = unpickle_object(sparse_matrix_p)
        self.pprint("[END] Clean data already exist, unpickling.",
                    get_ram(), get_elapsed_time(tic))
        return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values
    user_set = set(dictio_of_users.keys())
    self.pprint("Taking values that appear once.", get_ram(),
                get_elapsed_time(tic), end='\r')
    # Stage 2: keep values associated with more than one user.
    value_set = set([k for k, v in dictio_of_values.items() if len(v) > 1])
    # NOTE(review): the first %d ("Process Remaining") reuses len(value_set)
    # -- possibly intended to be a different count; left as-is.
    self.pprint(
        "[END] Taking values that appear once",
        "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
        (len(value_set), len(user_set), len(value_set)), get_ram(),
        get_elapsed_time(tic))
    # We execute all user removal procedures specified
    if not self.user_removal is None:
        for ind, procedure in enumerate(self.user_removal):
            self.pprint("Executing user removal procedure [%d] " % (ind),
                        get_ram(), get_elapsed_time(tic), end='\r')
            # Each procedure returns the users to KEEP; intersect so that
            # only users approved by every procedure survive.
            user_list = procedure(user_ind, value_ind, dictio_of_users,
                                  dictio_of_values)
            user_set = user_set.intersection(set(user_list))
            self.pprint(
                "[END] Executing user removal procedure [%d]" % (ind + 1),
                "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                (len(user_list), len(user_set), len(value_set)), get_ram(),
                get_elapsed_time(tic))
    # We execute all value removal procedures specified by the user
    if not self.value_removal is None:
        for ind, procedure in enumerate(self.value_removal):
            self.pprint("Executing value removal procedure [%d]" % (ind),
                        get_ram(), get_elapsed_time(tic), end='\r')
            value_list = procedure(user_ind, value_ind, dictio_of_users,
                                   dictio_of_values)
            value_set = value_set.intersection(set(value_list))
            self.pprint(
                "[END] Executing value removal procedure [%d]" % (ind + 1),
                "[Process Remaining: %d] [User Set: %d] [Value Set: %d]" %
                (len(value_list), len(user_set), len(value_set)), get_ram(),
                get_elapsed_time(tic))
    # Stage 4: first pruning pass with the surviving user/value sets.
    sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
        sparse_matrix, user_ind, value_ind, dictio_of_users,
        dictio_of_values, user_set, value_set)
    self.pprint("Obtaining empty data", end='\r')
    # Stage 5: after the first pass some users/values may have lost all
    # their entries; drop them with a second pass.
    user_set = set(dictio_of_users.keys())
    value_set = set(dictio_of_values.keys())
    user_set_rem = set([
        uind for uind, vinds in dictio_of_users.items() if len(vinds) == 0
    ])
    value_set_rem = set([
        vind for vind, uinds in dictio_of_values.items() if len(uinds) == 0
    ])
    user_set = user_set.difference(user_set_rem)
    value_set = value_set.difference(value_set_rem)
    self.pprint("[END] Obtaining empty data")
    sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values = self.remove_user_value_set(
        sparse_matrix, user_ind, value_ind, dictio_of_users,
        dictio_of_values, user_set, value_set)
    # Stage 6: persist all five cleaned artifacts for the backup fast path.
    pickle_object(sparse_matrix, sparse_matrix_p)
    pickle_object(user_ind, user_ind_p)
    pickle_object(value_ind, value_ind_p)
    pickle_object(dictio_of_users, dictio_of_users_p)
    pickle_object(dictio_of_values, dictio_of_values_p)
    return sparse_matrix, user_ind, value_ind, dictio_of_users, dictio_of_values