class BalanedData:
    """Collect rows for clicked users plus a capped number of other users.

    Every id in ``clickedUsers`` is registered in a membership filter up
    front. Rows passed to :meth:`addUserRow` are kept when their user is
    already registered; unregistered users are admitted (and registered)
    only while fewer of them have been admitted than there were clicked
    users, so the two groups end up with at most the same number of users.

    NOTE(review): ``BloomFilter`` is defined outside this block; presumably
    its ``contains`` can report false positives, which would let a few
    extra rows through -- confirm against its implementation.
    """

    def __init__(self, filterSize, hashCount, clickedUsers):
        """Register the clicked users and reset the collection state.

        ``filterSize`` and ``hashCount`` are forwarded unchanged to
        ``BloomFilter``; ``clickedUsers`` must support ``len()`` and
        iteration.
        """
        # Number of non-clicked users admitted so far.
        self.noClickedCounter = 0
        # Every row accepted by addUserRow, in arrival order.
        self.allData = []
        # Upper bound on how many non-clicked users may be admitted.
        self.clickedCounter = len(clickedUsers)
        self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
        self.__addUsers(clickedUsers)

    def __addUsers(self, clickedUsersIds):
        """Register each id from ``clickedUsersIds`` in the filter."""
        for clickedId in clickedUsersIds:
            self.__addUser(clickedId)

    def __addUser(self, userId):
        """Register a single ``userId`` in the filter."""
        self.collectedDataUsersFilter.add(userId)

    def addUserRow(self, userId, row):
        """Append ``row`` to ``allData`` if its user is, or can be, collected.

        A user the filter does not yet report is admitted only while the
        non-clicked quota (``clickedCounter``) has room; otherwise the row
        is dropped.
        """
        if not self.collectedDataUsersFilter.contains(userId):
            if self.clickedCounter <= self.noClickedCounter:
                # Quota of non-clicked users is exhausted: drop the row.
                return
            # Admit this user so their later rows are collected as well.
            self.__addUser(userId)
            self.noClickedCounter += 1
        self.allData.append(row)
def sampleData(file1, file2, column, secondColumn='fc20',
               filterSize=13419082, hashCount=23):
    """Count how many user ids read from ``file2`` also occur in ``file1``.

    The ids read from ``file1`` (column ``column``) are stringified and
    loaded into a ``BloomFilter``; each id read from ``file2`` (column
    ``secondColumn``) is then stringified and looked up in that filter.

    Args:
        file1: Source passed to ``userIds`` to build the reference set.
        file2: Source passed to ``userIds`` for the ids being checked.
        column: Column passed to ``userIds`` for ``file1``.
        secondColumn: Column passed to ``userIds`` for ``file2``. Defaults
            to ``'fc20'``, the value that used to be hard-coded.
        filterSize: First argument given to ``BloomFilter``. Defaults to
            the previously hard-coded ``13419082``.
        hashCount: Second argument given to ``BloomFilter``. Defaults to
            the previously hard-coded ``23``.

    Returns:
        A ``(same, diff)`` tuple: the number of ``file2`` ids the filter
        reports as present, and the number it reports as absent.

    NOTE(review): ``BloomFilter`` is defined outside this block; presumably
    ``contains`` may yield false positives, in which case ``same`` is an
    upper bound -- confirm against its implementation.
    """
    # Named ``knownUsers`` rather than ``filter`` so the builtin is not shadowed.
    knownUsers = BloomFilter(filterSize, hashCount)
    for user in userIds(file1, column):
        knownUsers.add(str(user))

    same = 0
    diff = 0
    for user in userIds(file2, secondColumn):
        if knownUsers.contains(str(user)):
            same += 1
        else:
            diff += 1
    return same, diff