示例#1
0
def user_removal_based_on_participation():
    """Return the set of users whose participation is above the median.

    The result is cached in ``num_files/keep_users.pkl``; when that file
    exists it is unpickled and returned without recomputation.

    Otherwise ``num_files/user_to_num.csv`` is read (its first row is
    skipped). Each remaining row is interpreted as: column 0 the user,
    column 1 the number of posts, columns 2+ the character count of each
    post. A user is kept when the number of posts is strictly above the
    median number of posts, OR the average characters per post is strictly
    above the median of those averages.

    Users without any per-post character count do not contribute to the
    median of averages and can only be kept through their number of posts.

    Returns:
        set: users to keep. An empty set (not cached) when the CSV holds no
        data rows.
    """
    keep_users_p = 'num_files/keep_users.pkl'
    if os.path.exists(keep_users_p):
        return unpickle_object(keep_users_p)

    lst = read_csv_list('num_files/user_to_num.csv')[1:]
    if not lst:
        # Nothing to rank; avoid calling np.quantile on an empty array.
        return set()

    users = [row[0] for row in lst]
    # Number of posts per user
    posts = np.array([int(row[1]) for row in lst])
    # Characters per user post
    chars = [np.array([int(c) for c in row[2:]]) for row in lst]

    # Average characters per post of a user. Users with no character data
    # get None so that this list stays positionally aligned with `users`
    # and `posts` (filtering them out would shift every later user onto
    # the wrong average).
    averages = [c.mean() if len(c) > 0 else None for c in chars]
    valid_averages = np.array([a for a in averages if a is not None])

    limi = np.quantile(posts, .50)
    # The median is computed only over users that actually have averages.
    limk = np.quantile(valid_averages, .50) if len(valid_averages) > 0 else None

    keep_users = set()
    for user, num_posts, avg_chars in zip(users, posts, averages):
        above_avg = (limk is not None and avg_chars is not None
                     and avg_chars > limk)
        if num_posts > limi or above_avg:
            keep_users.add(user)

    pickle_object(keep_users, keep_users_p)
    return keep_users
示例#2
0
    def join_all_results(self):
        """Build (or load) the dictionary that joins the results of all ids.

        For every id in ``self.ids`` the pickled results of that id are read,
        their count is stored in ``self.total[_id]`` and reported, and an
        empty entry is created for each user pair (the first two fields of a
        result). If ``self.backup`` is set and ``join_dict.pkl`` exists, the
        pickled dictionary is returned instead. Otherwise the entries are
        filled through ``self.fill_results_for_id`` for each id, and the
        final dictionary is pickled to ``join_dict.pkl`` and returned.
        """
        cache_path = "join_dict.pkl"

        merged = {}
        for _id in self.ids:
            results = unpickle_object(_id + "_files/results.pkl")
            self.total[_id] = len(results)
            self.pprint("[%s] Total results: %d" % (_id, self.total[_id]))
            for res in results:
                # Key is the (user1, user2) prefix of each result
                merged.setdefault(res[:2], {})

        # The totals above are always refreshed, even when the cached
        # dictionary ends up being used.
        if self.backup and os.path.exists(cache_path):
            return unpickle_object(cache_path)

        for _id in self.ids:
            merged = self.fill_results_for_id(_id, merged)

        pickle_object(merged, cache_path)
        return merged
示例#3
0
    def get_information_from_matrix(self, user_ind, sparse_matrix_dot):
        """Extract, for every row, the columns holding the row maximum.

        Each row of ``sparse_matrix_dot`` is densified, its own diagonal
        position is zeroed (a user is not compared with itself) and, when the
        remaining maximum is positive, one ``(user, other_user, value)``
        tuple is recorded for every column that reaches that maximum.

        The resulting list is sorted by value in descending order, pickled to
        ``self.dir + "results.pkl"`` and exported to
        ``self.dir + "results.csv"``.

        Args:
            user_ind: mapping from user to matrix index; it is inverted here
                to translate indices back into users.
            sparse_matrix_dot: sparse matrix whose rows support indexing and
                ``toarray()``.

        Returns:
            list: the sorted ``(user1, user2, relation_value)`` tuples.
        """
        tic = time.time()
        inv_user_ind = {v: k for k, v in user_ind.items()}

        lst_res = []
        tx = sparse_matrix_dot.shape[0]
        print(sparse_matrix_dot.shape)
        for uind in range(tx):
            if uind % 100 == 0:
                self.pprint("Info Extraction",
                            "[%d Users Processed]" % (uind),
                            "[%d List Length]" % (len(lst_res)),
                            "[%0.3f Percentage]" % ((uind / tx) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            row = sparse_matrix_dot[uind].toarray().flatten()
            row[uind] = 0  # We do not consider the comparison with itself
            rmax = row.max()
            if rmax > 0:
                # Every column tied at the maximum, in ascending index order.
                for i in np.flatnonzero(row == rmax):
                    lst_res.append((inv_user_ind[uind], inv_user_ind[i], rmax))

        lst_res.sort(key=lambda x: x[2], reverse=True)
        pickle_object(lst_res, self.dir + "results.pkl")
        # Report totals from `tx` rather than the loop variable, which is
        # unbound (and would divide by zero) when the matrix has no rows.
        self.pprint("[END] Info Extraction", "[%d Users Processed]" % (tx),
                    "[%d List Length]" % (len(lst_res)),
                    "[%0.3f Percentage]" % (100.0), get_ram(),
                    get_elapsed_time(tic))

        gen_csv_from_tuples(self.dir + "results.csv",
                            ["User1", "User2", "Relation Value"], lst_res)

        return lst_res
示例#4
0
 def compute_matrix_mult(self, sparse_matrix):
     """Multiply the matrix by its transpose and keep the upper triangle.

     The product ``sparse_matrix . sparse_matrix.T`` is reduced with
     ``sparse.triu`` to CSR format, explicit zeros are removed, and the
     result is pickled to ``self.dir + 'sparse_matrix_dot_a.pkl'``.
     Shapes are printed before and after the computation.

     Returns the resulting matrix.
     """
     started = time.time()
     output_path = self.dir + 'sparse_matrix_dot_a.pkl'
     print(sparse_matrix.shape)
     self.pprint("Executing coincidence computation over matrix",
                 get_ram(),
                 get_elapsed_time(started),
                 end='\r')
     # Row-by-row product, then drop everything below the diagonal
     product = sparse.triu(sparse_matrix.dot(sparse_matrix.T), format='csr')
     print(product.shape)
     product.eliminate_zeros()
     pickle_object(product, output_path)
     self.pprint("[END] Executing coincidence computation over matrix",
                 get_ram(), get_elapsed_time(started))
     print(product.shape)
     return product
示例#5
0
    def gen_sparse_matrix(self, dictio_of_users, dictio_of_usage, num_values):
        """Build (or load) the user/value CSR matrix.

        The target file ``self.dir + 'sparse_matrix.pkl'`` is registered in
        ``self.cleanup_list``. If ``self.backup`` is set and that file
        exists, the pickled matrix is returned. Otherwise one matrix entry is
        produced per ``(value, usage)`` pair of each user and the matrix is
        pickled before being returned.

        Args:
            dictio_of_users: mapping from row index to the column indices
                used by that row.
            dictio_of_usage: mapping with the same keys giving the data
                stored at those columns; pairs are matched positionally and
                any surplus on either side is ignored (``zip`` semantics).
            num_values: number of columns of the matrix.

        Returns:
            The ``csr_matrix`` of shape ``(len(dictio_of_users), num_values)``
            with dtype ``self.dtype``.
        """
        tic = time.time()
        sparse_matrix_p = self.dir + 'sparse_matrix.pkl'

        # Adding files to list for cleanup
        self.cleanup_list.append(sparse_matrix_p)

        if self.backup and os.path.exists(sparse_matrix_p):
            self.pprint("Sparse Matrix already exist, unpickling.", end='\r')
            sparse_matrix = unpickle_object(sparse_matrix_p)
            self.pprint("[END] Sparse Matrix already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return sparse_matrix

        num_users = len(dictio_of_users)
        rows = []
        cols = []
        data = []
        for ind, (uind, values) in enumerate(dictio_of_users.items()):
            if ind % 1000 == 0:
                self.pprint("Sparse Matrix Generation",
                            "[%d Users Processed]" % (ind),
                            "[%0.3f Percentage]" % ((ind / num_users) * 100),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
            usages = dictio_of_usage[uind]
            for value, usage in zip(values, usages):
                rows.append(uind)
                cols.append(value)
                data.append(usage)

        sparse_matrix = csr_matrix((data, (rows, cols)),
                                   shape=(num_users, num_values),
                                   dtype=self.dtype)
        # Report the real total instead of the loop variable, which is
        # unbound (and would divide by zero) when there are no users.
        self.pprint("[END] Sparse Matrix Generation",
                    "[%d Users Processed]" % (num_users),
                    "[%0.3f Percentage]" % (100.0),
                    get_ram(), get_elapsed_time(tic))
        pickle_object(sparse_matrix, sparse_matrix_p)
        return sparse_matrix
示例#6
0
    def gen_matrix(self, join_dict):
        """Build (or load) the dense pair-by-feature matrix.

        Rows follow the iteration order of ``join_dict`` and columns follow
        the order of ``self.ids``. A cell holds the value stored for that
        feature in the pair's dictionary, or 0 when the feature is absent.
        The matrix is ``uint32`` and is pickled to ``matrix.pkl``; when
        ``self.backup`` is set and that file exists it is loaded instead.
        """
        cache_path = "matrix.pkl"
        self.pprint("Matrix generation", end='\r')
        if self.backup and os.path.exists(cache_path):
            self.pprint("[END] Matrix generation, already existed")
            return unpickle_object(cache_path)

        shape = (len(join_dict), len(self.ids))
        matrix = np.zeros(shape, dtype=np.uint32)
        for row_idx, feature_dict in enumerate(join_dict.values()):
            for col_idx, feature in enumerate(self.ids):
                if feature not in feature_dict:
                    continue
                matrix[row_idx, col_idx] = feature_dict[feature]

        self.pprint("[END] Matrix generation")
        pickle_object(matrix, cache_path)
        return matrix
示例#7
0
    def clean_matrix(self, sparse_matrix, user_ind, value_ind, dictio_of_users,
                     dictio_of_values):
        """Filter users and values out of the matrix and its companion maps.

        Steps, in order:

        1. Register the five ``clean_*.pkl`` cache files in
           ``self.cleanup_list``; if ``self.backup`` is set and all of them
           exist, load and return their contents.
        2. Start from all users and from the values whose entry in
           ``dictio_of_values`` has more than one element.
        3. Intersect the user set with the result of every procedure in
           ``self.user_removal`` and the value set with the result of every
           procedure in ``self.value_removal`` (either may be ``None``).
        4. Apply the sets through ``self.remove_user_value_set``.
        5. Drop users and values whose entries became empty and apply
           ``self.remove_user_value_set`` again.
        6. Pickle the five resulting objects and return them.

        Returns:
            tuple: ``(sparse_matrix, user_ind, value_ind, dictio_of_users,
            dictio_of_values)`` after cleaning.
        """
        tic = time.time()

        sparse_matrix_p = self.dir + 'clean_sparse_matrix.pkl'
        user_ind_p = self.dir + 'clean_user_ind.pkl'
        value_ind_p = self.dir + 'clean_value_ind.pkl'
        dictio_of_users_p = self.dir + 'clean_dictio_of_users.pkl'
        dictio_of_values_p = self.dir + 'clean_dictio_of_values.pkl'
        cache_paths = (sparse_matrix_p, user_ind_p, value_ind_p,
                       dictio_of_users_p, dictio_of_values_p)

        # Register every cache file for cleanup
        for path in cache_paths:
            self.cleanup_list.append(path)

        if self.backup and all(os.path.exists(path) for path in cache_paths):
            self.pprint("Clean data already exist, unpickling.", end='\r')
            user_ind = unpickle_object(user_ind_p)
            value_ind = unpickle_object(value_ind_p)
            dictio_of_users = unpickle_object(dictio_of_users_p)
            dictio_of_values = unpickle_object(dictio_of_values_p)
            sparse_matrix = unpickle_object(sparse_matrix_p)
            self.pprint("[END] Clean data already exist, unpickling.",
                        get_ram(), get_elapsed_time(tic))
            return (sparse_matrix, user_ind, value_ind, dictio_of_users,
                    dictio_of_values)

        remaining_fmt = "[Process Remaining: %d] [User Set: %d] [Value Set: %d]"

        user_set = set(dictio_of_users)
        self.pprint("Taking values that appear once.",
                    get_ram(),
                    get_elapsed_time(tic),
                    end='\r')
        # Only values with more than one associated element are kept
        value_set = {
            vind for vind, uinds in dictio_of_values.items() if len(uinds) > 1
        }
        self.pprint("[END] Taking values that appear once",
                    remaining_fmt %
                    (len(value_set), len(user_set), len(value_set)),
                    get_ram(), get_elapsed_time(tic))

        # Run every configured user removal procedure; each one returns the
        # users it allows, and only users allowed by all of them survive.
        if self.user_removal is not None:
            for ind, procedure in enumerate(self.user_removal):
                self.pprint("Executing user removal procedure [%d] " % (ind),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
                user_list = procedure(user_ind, value_ind, dictio_of_users,
                                      dictio_of_values)
                user_set = user_set & set(user_list)
                self.pprint(
                    "[END] Executing user removal procedure [%d]" % (ind + 1),
                    remaining_fmt %
                    (len(user_list), len(user_set), len(value_set)),
                    get_ram(), get_elapsed_time(tic))

        # Same scheme for the configured value removal procedures
        if self.value_removal is not None:
            for ind, procedure in enumerate(self.value_removal):
                self.pprint("Executing value removal procedure [%d]" % (ind),
                            get_ram(),
                            get_elapsed_time(tic),
                            end='\r')
                value_list = procedure(user_ind, value_ind, dictio_of_users,
                                       dictio_of_values)
                value_set = value_set & set(value_list)
                self.pprint(
                    "[END] Executing value removal procedure [%d]" % (ind + 1),
                    remaining_fmt %
                    (len(value_list), len(user_set), len(value_set)),
                    get_ram(), get_elapsed_time(tic))

        cleaned = self.remove_user_value_set(sparse_matrix, user_ind,
                                             value_ind, dictio_of_users,
                                             dictio_of_values, user_set,
                                             value_set)
        (sparse_matrix, user_ind, value_ind, dictio_of_users,
         dictio_of_values) = cleaned

        # Second pass: keep only the users/values that still have content
        self.pprint("Obtaining empty data", end='\r')
        user_set = {
            uind for uind, vinds in dictio_of_users.items() if len(vinds) != 0
        }
        value_set = {
            vind for vind, uinds in dictio_of_values.items() if len(uinds) != 0
        }
        self.pprint("[END] Obtaining empty data")
        cleaned = self.remove_user_value_set(sparse_matrix, user_ind,
                                             value_ind, dictio_of_users,
                                             dictio_of_values, user_set,
                                             value_set)
        (sparse_matrix, user_ind, value_ind, dictio_of_users,
         dictio_of_values) = cleaned

        pickle_object(sparse_matrix, sparse_matrix_p)
        pickle_object(user_ind, user_ind_p)
        pickle_object(value_ind, value_ind_p)
        pickle_object(dictio_of_users, dictio_of_users_p)
        pickle_object(dictio_of_values, dictio_of_values_p)
        return (sparse_matrix, user_ind, value_ind, dictio_of_users,
                dictio_of_values)