def histogram():
    """
    Draw plot that shows ratio of number of users and number of conferences
    that they visited.

    The x axis is the number of conferences, the y axis is the number of
    users that have exactly that many entries in the ``members`` mapping
    returned by ``get_members_and_conferences()``. The plot is only drawn on
    the current pyplot figure; showing or saving it is left to the caller.
    """
    members, _ = get_members_and_conferences()
    cnt = Counter(len(visited) for visited in members.values())

    # The raw mappings are no longer needed once the counts are taken;
    # drop the references and collect before plotting.
    del _
    del members
    gc.collect()

    # Counter iterates in first-seen order. Sort by conference count so the
    # line is drawn left to right instead of jumping between arbitrary points.
    conference_counts = sorted(cnt)
    user_counts = [cnt[k] for k in conference_counts]

    plt.plot(conference_counts, user_counts)
    plt.xlabel('Number of conferences')
    plt.ylabel(u'Number of users')
    # Fixed viewport: x in [0, 60], y in [0, 50000].
    plt.axis([0, 60, 0, 50000])
def prepare_similarity_matrix():
    """
    Compute similarity matrix for collaborative filtering model.

    Weight is cosine between the vectors. Weight will be computed only if
    two conferences include common users, otherwise weight = 0.

    For a pair of conferences the weight is the number of nonzero entries
    of one conference's row restricted to the other conference's users,
    divided by ``sqrt(len(users_a)) * sqrt(len(users_b))``.
    NOTE(review): this equals the cosine only if the sparse matrix is a
    0/1 conference-by-user incidence matrix -- confirm against
    ``get_sparse_matrix()``.

    Side effects: prints progress to stdout and appends one ``i,j,weight``
    line per computed pair to ``output/similarity_triples.txt``. The dense
    matrix is filled in memory but not returned (the function returns
    ``None``).
    """
    m = get_sparse_matrix()
    members, conferences = get_members_and_conferences()
    (num_to_member,
     member_to_num,
     num_to_conference,
     conference_to_num,
     _,
     _) = get_mapping()
    # The last two mappings are unused here; drop the reference.
    del _

    n_confs = m.shape[0]
    n_users = m.shape[1]
    print(n_confs)
    res = np.zeros((n_confs, n_confs))

    print('Preparing...')
    # users[i]   -> numeric ids of the users of conference i
    # squares[i] -> sqrt of the number of users of conference i
    users = []
    squares = []
    for i in range(n_confs):
        if (i * 100 / n_confs) % 10 == 0:
            print('{}%'.format(i * 100 / n_confs))
        conf_users = [member_to_num[x]
                      for x in conferences[num_to_conference[i]]]
        users.append(conf_users)
        squares.append(sqrt(len(conf_users)))

    # users_confs[u] -> numeric ids of the conferences of user u
    users_confs = []
    for i in range(n_users):
        if i % 50000 == 0:
            print('{}%'.format(i * 100 / n_users))
        users_confs.append([conference_to_num[x]
                            for x in members[num_to_member[i]]])

    print('------\nMain part\n------')
    start = mktime(localtime())
    m = m.tocsc()
    # The context manager closes the file even if the loop below raises.
    with open('output/similarity_triples.txt', 'a') as matrix_file:
        for i in range(n_confs):
            if i % 20 == 0:
                print('---\n{}%'.format(round(i * 100 / n_confs, 3)))
            # Only conferences reachable through a user of conference i
            # are visited, so pairs without common users stay at 0.
            for u in users[i]:
                for uc in users_confs[u]:
                    if res[i, uc] != 0:
                        # Pair already computed (directly or by symmetry).
                        continue
                    # Index with the shorter user list to keep the
                    # column selection small.
                    if len(users[uc]) < len(users[i]):
                        common = len(m[i, users[uc]].nonzero()[1])
                    else:
                        common = len(m[uc, users[i]].nonzero()[1])
                    weight = common / (squares[uc] * squares[i])
                    res[i, uc] = weight
                    res[uc, i] = weight
                    matrix_file.write('{},{},{}\n'.format(i, uc, weight))

    # mktime(localtime()) has whole-second resolution.
    elapsed = mktime(localtime()) - start
    print('---------\n TIME = {} \n----------'.format(elapsed))