class Experiment:
    """Build and cache ego-network features from the Twitter database.

    On construction the valid users are read through ``DBAdapter`` and a
    sequence of ``get*`` methods is run.  Each of those methods stores its
    result on the instance and mirrors it in a pickle file below
    ``StaticVariable.ROOTPATH``; when the pickle file already exists it is
    loaded instead of recomputing the data.

    Attributes set by ``__init__`` (directly or through the methods it calls):
        dbAdapter, tool, validUsers, indU, egousers, friendList,
        like_vectors, authorship_on_likedtweets, like_count, mention_count,
        mutual_friends_count.
    ``topic_vectors`` and ``clusters`` are only set when ``getTopicVectors``
    / ``loadClusters`` are called explicitly.
    """

    def __init__(self):
        self.dbAdapter = DBAdapter(StaticVariable.ROOTPATH + "TwitterData.sqlite")
        self.tool = MyTools()

        # Get valid user list
        self.validUsers = self.dbAdapter.getValidUserList()
        print("Valid users: " + str(len(self.validUsers)))

        # Add index on valid users: {UserID: position in self.validUsers}
        self.indU = {user: index for index, user in enumerate(self.validUsers)}

        # Order matters: later steps read attributes set by earlier ones
        # (e.g. getAuthorshipOnLikedTweets reads self.like_vectors).
        self.getSeedUsers()
        self.getFriendList()
        self.filterSmallEgoNetwork()
        self.getLikeVectors()
        self.getAuthorshipOnLikedTweets()
        self.getLikeCount()
        self.getMentionCount()
        self.getMutualFriendsCount()

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _loadPickle(path):
        """Return the object stored in the pickle file at *path*.

        NOTE(review): unpickling executes arbitrary code; these cache files
        must only ever come from a trusted location.
        """
        with open(path, "rb") as dumpfile:
            return pickle.load(dumpfile)

    @staticmethod
    def _dumpPickle(path, obj):
        """Write *obj* as a pickle to *path*, closing the file even on error."""
        with open(path, "wb") as dumpfile:
            pickle.dump(obj, dumpfile)

    def _networkMembers(self):
        """Return every ego user and their friends once, in first-seen order."""
        members = []
        seen = set()
        for egouser in self.egousers:
            if egouser not in seen:
                seen.add(egouser)
                members.append(egouser)
            for friend in self.friendList[egouser]:
                if friend not in seen:
                    seen.add(friend)
                    members.append(friend)
        return members

    def _writePairsFiles(self, directory):
        """Write one ``<index>.pairs`` edge-list file per ego user into *directory*.

        Each file lists, for every friend of the ego user, that friend's own
        friendships as tab-separated ``self.indU`` indices.  The ego user is
        not written as a source node.
        """
        for egouser in self.egousers:
            path = directory + str(self.indU[egouser]) + ".pairs"
            with open(path, "w") as pairsFile:
                for user in self.friendList[egouser]:
                    for friend in self.friendList[user]:
                        pairsFile.write(str(self.indU[user]) + "\t" + str(self.indU[friend]) + "\n")

    @staticmethod
    def _runClustering(directory):
        """Run ``FastCommunityMH -f <file>`` for every ``*.pairs`` file in *directory*.

        The executable is invoked with an argument list (no shell), so paths
        containing spaces or shell metacharacters are passed through intact.
        Output of the tool is captured and discarded; its exit status is not
        checked.  A missing executable raises ``OSError`` instead of failing
        silently.
        """
        executable = directory + "FastCommunityMH"
        for pairsFile in sorted(glob.glob(directory + "*.pairs")):
            subprocess.Popen([executable, "-f", pairsFile],
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE).communicate()

    def _readClusterFile(self, filepath):
        """Parse one cluster file and append its clusters to ``self.clusters``.

        The file name is interpreted as the ego user's index; every line is
        one cluster of tab-separated member indices.  Tokens that are not
        integers are skipped, and the ego user is left out of the member
        sets.
        """
        egouser = self.getID(int(os.path.basename(filepath)))
        with open(filepath, "r") as clusterFile:
            for line in clusterFile:
                memberset = []
                for token in line.split("\t"):
                    try:
                        index = int(token)
                    except ValueError:
                        # Not a member index (e.g. empty or textual token).
                        continue
                    # NOTE(review): getID returns -1 for an out-of-range
                    # index and that -1 is kept as a member, as before.
                    memberID = self.getID(index)
                    if memberID != egouser:
                        memberset.append(memberID)
                self.clusters.setdefault(egouser, []).append(memberset)

    # ------------------------------------------------------------------
    # Feature loaders
    # ------------------------------------------------------------------
    def getSeedUsers(self):
        """Set ``self.egousers`` to the seed users that are also valid users.

        Duplicates are dropped while keeping the order returned by
        ``dbAdapter.getSeedUserList()``.  Cached in ``egousers.pickle``.
        """
        # Path to save dump file
        FILE_EGOUSERS = StaticVariable.ROOTPATH + "egousers.pickle"

        # Get seed user list
        if os.path.exists(FILE_EGOUSERS):
            self.egousers = self._loadPickle(FILE_EGOUSERS)
            return

        validUsers = set(self.validUsers)  # O(1) membership tests
        seen = set()
        self.egousers = []  # INTEGER ARRAY
        for seed in self.dbAdapter.getSeedUserList():
            if seed in validUsers and seed not in seen:
                seen.add(seed)
                self.egousers.append(seed)
        self._dumpPickle(FILE_EGOUSERS, self.egousers)

    def getFriendList(self):
        """Set ``self.friendList`` to ``{valid user: [valid friends]}``.

        Friends come from ``dbAdapter.getFriendship``; those that are not
        valid users, and repeated entries, are dropped.  Cached in
        ``friendlist.pickle``.
        """
        # Path to save dump file
        FILE_FRIENDLIST = StaticVariable.ROOTPATH + "friendlist.pickle"

        # Get FriendList
        print("Getting friend list...")
        if os.path.exists(FILE_FRIENDLIST):
            self.friendList = self._loadPickle(FILE_FRIENDLIST)
            return

        validUsers = set(self.validUsers)  # O(1) membership tests
        self.friendList = {}  # KEY: INTEGER / VALUES INTEGER ARRAY
        for validUser in self.validUsers:
            validFriends = []
            seen = set()
            for friend in self.dbAdapter.getFriendship(validUser):
                if friend in validUsers and friend not in seen:
                    seen.add(friend)
                    validFriends.append(friend)
            self.friendList[validUser] = validFriends
        self._dumpPickle(FILE_FRIENDLIST, self.friendList)

    def filterSmallEgoNetwork(self):
        """Keep only ego users with more than 100 friends and print statistics.

        Replaces ``self.egousers`` in place.  When no ego user survives the
        filter the reported average is 0.0 rather than a division error.
        """
        # Filter small ego network
        print("Filtering ego users...")
        print("\t" + str(len(self.egousers)) + " users => ", end="")
        self.egousers = [egouser for egouser in self.egousers
                         if len(self.friendList[egouser]) > 100]
        print(str(len(self.egousers)) + " users")

        allUsers = set()
        for egouser in self.egousers:
            allUsers.add(egouser)
            allUsers.update(self.friendList[egouser])
        print("\t" + str(len(allUsers)) + " users are being in " + str(len(self.egousers)) + " ego networks")
        print()

        # Average number of members for each ego network (friends + the ego user)
        nFriends = sum(len(self.friendList[egouser]) + 1 for egouser in self.egousers)
        if self.egousers:
            average = nFriends / len(self.egousers)
        else:
            average = 0.0  # no ego network left; avoid ZeroDivisionError
        print("Average number of network members: " + str(average))

    def getTopicVectors(self):
        """Set ``self.topic_vectors`` from an ``LDASampler`` run over all members.

        Cached in ``topic_vectors.pickle``.
        """
        # Path to save dump file
        FILE_TOPICVECTORS = StaticVariable.ROOTPATH + "topic_vectors.pickle"

        # Gibbs sampling setting
        NTOPICS = 30
        GIBBS_SAMPLES = 100
        BURNIN_POINT = 50
        SAMPLING_INTERVAL = 2

        print("Getting topic vectors...")
        if os.path.exists(FILE_TOPICVECTORS):
            self.topic_vectors = self._loadPickle(FILE_TOPICVECTORS)
        else:
            # The previous code created a dict and then called .append on it,
            # which always raised AttributeError.  A list is passed instead.
            # NOTE(review): assumes LDASampler accepts a list of user IDs — confirm.
            userlist = self._networkMembers()

            sampler = LDASampler(NTOPICS, userlist)
            sampler.run(GIBBS_SAMPLES, BURNIN_POINT, SAMPLING_INTERVAL)
            self.topic_vectors = sampler.getTopicVectors()
            self._dumpPickle(FILE_TOPICVECTORS, self.topic_vectors)
        print("\tCalculated topic vectors of " + str(len(self.topic_vectors.keys())) + " users")

    def getLikeVectors(self):
        """Set ``self.like_vectors`` to ``{user: dbAdapter.getLikingTweets(user)}``.

        Covers every ego user and each of their friends.  Cached in
        ``like_vectors.pickle``.
        """
        # Path to save dump file
        FILE_LIKEVECTORS = StaticVariable.ROOTPATH + "like_vectors.pickle"

        print("Getting like vectors...")
        if os.path.exists(FILE_LIKEVECTORS):
            self.like_vectors = self._loadPickle(FILE_LIKEVECTORS)
        else:
            self.like_vectors = {}  # {user: [tweet, ...], ...}
            for user in self._networkMembers():
                self.like_vectors[user] = self.dbAdapter.getLikingTweets(user)
            self._dumpPickle(FILE_LIKEVECTORS, self.like_vectors)
        print("\tCalculated Like vectors of " + str(len(self.like_vectors.keys())) + " users")

    def getAuthorshipOnLikedTweets(self):
        """Set ``self.authorship_on_likedtweets``.

        For each ego network, maps every member (ego user and friends) to the
        tweets they authored that some member of the same network liked.
        Requires ``self.like_vectors``.  Cached in
        ``authorship_on_likedtweets.pickle``.
        """
        # Path to save dump file
        FILE_AUTHORSHIP_ON_LIKEDTWEET = StaticVariable.ROOTPATH + "authorship_on_likedtweets.pickle"

        print("Getting authorship on liked tweets...")
        if os.path.exists(FILE_AUTHORSHIP_ON_LIKEDTWEET):
            self.authorship_on_likedtweets = self._loadPickle(FILE_AUTHORSHIP_ON_LIKEDTWEET)
        else:
            self.authorship_on_likedtweets = {}  # {egouser: {member: [tweet, ...], ...}, ...}
            for egouser in self.egousers:
                membersInEgoNetwork = {egouser: []}
                for friend in self.friendList[egouser]:
                    membersInEgoNetwork[friend] = []

                # Find liked tweets in ego network
                likedTweets = set()
                for member in membersInEgoNetwork:
                    likedTweets.update(self.like_vectors[member])

                # Find tweet list for each member of ego network
                tweetList = self.dbAdapter.getTweetListByAuthor(membersInEgoNetwork)
                for member, authored in membersInEgoNetwork.items():
                    authored.extend(tweet for tweet in tweetList[member] if tweet in likedTweets)

                self.authorship_on_likedtweets[egouser] = membersInEgoNetwork
            self._dumpPickle(FILE_AUTHORSHIP_ON_LIKEDTWEET, self.authorship_on_likedtweets)
        print("\tFound authorship on liked tweets for each ego network")

    def getMentionCount(self):
        """Set ``self.mention_count`` to ``{egouser: {friend: count}}``.

        When the reverse pair (friend -> egouser) was already computed its
        value is reused instead of querying ``dbAdapter.getMentionCount``
        again.  Cached in ``mention_count.pickle``.
        """
        # Path to save dump file
        FILE_MENTIONCOUNT = StaticVariable.ROOTPATH + "mention_count.pickle"

        print("Getting mention count...")
        if os.path.exists(FILE_MENTIONCOUNT):
            self.mention_count = self._loadPickle(FILE_MENTIONCOUNT)
        else:
            self.mention_count = {}
            for egouser in self.egousers:
                mentioncounts = {}
                for friend in self.friendList[egouser]:
                    if friend in self.mention_count and egouser in self.mention_count[friend]:
                        mentioncounts[friend] = self.mention_count[friend][egouser]
                    else:
                        mentioncounts[friend] = self.dbAdapter.getMentionCount(egouser, friend)
                self.mention_count[egouser] = mentioncounts
            self._dumpPickle(FILE_MENTIONCOUNT, self.mention_count)
        print("\tCalculated Mention counts: "
              + str(sum(len(self.mention_count[egouser]) for egouser in self.egousers))
              + " records")

    def getLikeCount(self):
        """Set ``self.like_count`` to ``{egouser: {friend: count}}``.

        Counts come from ``dbAdapter.getLikeCount(egouser, friend)``.  Cached
        in ``like_count.pickle``.
        """
        # Path to save dump file
        FILE_LIKECOUNT = StaticVariable.ROOTPATH + "like_count.pickle"

        print("Getting like count for a user...")
        if os.path.exists(FILE_LIKECOUNT):
            self.like_count = self._loadPickle(FILE_LIKECOUNT)
        else:
            self.like_count = {}
            for egouser in self.egousers:
                self.like_count[egouser] = {
                    friend: self.dbAdapter.getLikeCount(egouser, friend)
                    for friend in self.friendList[egouser]
                }
            self._dumpPickle(FILE_LIKECOUNT, self.like_count)
        print("\tCalculated Like counts: "
              + str(sum(len(self.like_count[egouser]) for egouser in self.egousers))
              + " records")

    def getMutualFriendsCount(self):
        """Set ``self.mutual_friends_count`` to ``{egouser: {friend: count}}``.

        When the reverse pair (friend -> egouser) was already computed its
        value is reused instead of calling
        ``dbAdapter.getMutualFriendsCount`` again.  Cached in
        ``mutual_friends_count.pickle``.
        """
        # Path to save dump file
        FILE_MUTUALFRIENDS = StaticVariable.ROOTPATH + "mutual_friends_count.pickle"

        print("Getting mutual friends count for a user...")
        if os.path.exists(FILE_MUTUALFRIENDS):
            self.mutual_friends_count = self._loadPickle(FILE_MUTUALFRIENDS)
        else:
            self.mutual_friends_count = {}
            for egouser in self.egousers:
                mutualFriendsCount = {}
                for friend in self.friendList[egouser]:
                    if friend in self.mutual_friends_count and egouser in self.mutual_friends_count[friend]:
                        mutualFriendsCount[friend] = self.mutual_friends_count[friend][egouser]
                    else:
                        mutualFriendsCount[friend] = self.dbAdapter.getMutualFriendsCount(
                            egouser, friend, self.friendList[egouser], self.friendList[friend])
                self.mutual_friends_count[egouser] = mutualFriendsCount
            self._dumpPickle(FILE_MUTUALFRIENDS, self.mutual_friends_count)
        print("\tCalculated Mutual friends counts: "
              + str(sum(len(self.mutual_friends_count[egouser]) for egouser in self.egousers))
              + " records")

    def getID(self, index):
        """Return the user ID at position *index* of ``self.validUsers``.

        Returns -1 when *index* is negative or past the end of the list.
        """
        if index < 0 or index >= len(self.validUsers):
            return -1
        return self.validUsers[index]

    def loadClusters(self):
        """Set ``self.clusters`` to ``{egouser: [[member, ...], ...]}``.

        Loads ``clusters.pickle`` when present.  Otherwise the cluster files
        under ``FastModularity/clusters/`` are parsed; if there are none, the
        ``*.pairs`` input files are generated when missing and the external
        ``FastCommunityMH`` tool is run first.  The result is then pickled.
        """
        # Path to save dump file
        FILE_CLUSTERS = StaticVariable.ROOTPATH + "clusters.pickle"

        if os.path.exists(FILE_CLUSTERS):
            self.clusters = self._loadPickle(FILE_CLUSTERS)
        else:
            self.clusters = {}  # {1: [[2, 3, 4], [6, 7, 8, 9]], 2: [[1, 3, 4], [5, 6, 7, 8]]}
            clusteringDir = StaticVariable.ROOTPATH + "FastModularity/"
            clusterFiles = glob.glob(clusteringDir + "clusters/*")
            if len(clusterFiles) == 0:
                # Check if there is input files for clustering
                if len(glob.glob(clusteringDir + "*.pairs")) == 0:
                    print("Making input files... ")
                    # Make network file(*.pairs) from self.friendList{}
                    self._writePairsFiles(clusteringDir)
                    print("\tdone!")

                # Do clustering by running the external tool on each input file
                print("Clustering... ")
                self._runClustering(clusteringDir)
                print("\tdone!")
                clusterFiles = glob.glob(clusteringDir + "clusters/*")

            # Load clustering information into memory
            for filepath in clusterFiles:
                self._readClusterFile(filepath)

            self._dumpPickle(FILE_CLUSTERS, self.clusters)

        nClusters = sum(len(clusterList) for clusterList in self.clusters.values())
        print("Loading clusters...")
        print("\t" + str(nClusters) + " clusters of " + str(len(self.egousers)) + " ego networks are loaded.")
        print()

    def show(self):
        """Display pending matplotlib figures via ``plt.show()``."""
        plt.show()