示例#1
0
 def testGoldenSectionSearch(self, regionCenter, expertise):
     expertRegion = Region(
         (45, -129), (25, -50), center=regionCenter, name="Test Region", isParent=True, expertise=expertise
     )
     self._usersBucket = BucketUsers(expertRegion, Settings.bucketingInterval)
     self._usersBucket.printBuckets()
     print self._goldenSectionSearch(expertRegion)
示例#2
0
 def testLikelihood(self, regionCenter, expertise, C, alpha):
     expertRegion = Region(
         (45, -129), (25, -50), center=regionCenter, name="Test Region", isParent=True, expertise=expertise
     )
     self._usersBucket = BucketUsers(expertRegion, Settings.bucketingInterval)
     print "likelihood value:", self._likelihoodFunction(C, alpha, expertRegion)
示例#3
0
class BackStromExpertiseModelGenerator:
    _dictExpertUsersData = {}
    _dictExpertiseRegions = {"tech": [Region((0, 0), (0, 0))]}
    _dictExpertModels = {"tech": [{"C": 0.8, "alpha": 2.9, "center": (23, -67.9)}]}
    _dataDirectory = ""
    _expertDataExtractor = None
    _usersClusterer = None
    _usersBucket = None

    def __init__(self, dataDirectory, dataExtractor=None):
        self._dictExpertiseRegions.clear()
        self._dictExpertModels.clear()
        self._dataDirectory = dataDirectory
        if dataExtractor == None:
            self._expertDataExtractor = DataExtractorFactory.getDataExtractor("expertisemodel", self._dataDirectory)
            self._expertDataExtractor.populateData(self._dataDirectory)
        else:
            self._expertDataExtractor = dataExtractor

        self._dictExpertUsersData = self._expertDataExtractor.getAllExpertsData()
        self._createParentRegions()
        UsersData.addUserDataToRegions(self._dictExpertUsersData)

    """
    Calculates the likelihood value for Backstorm model p = C*d^-alpha
    @param C: The value of parameter C
    @param alpha: The value of parameter C
    @param childRegion: The childRegion for which we want to calculate the model
    """

    def _likelihoodFunction(self, C, alpha, childRegion):
        expertValueSum = 0
        nonExpertValueSum = 0

        # We will use all the users data for a given expertise's parent childRegion
        # to calculate the log likelihood about the center of a
        # given child childRegion.

        usersDataBuckets = self._usersBucket.getUsersBucket()
        if childRegion.getName() != self._usersBucket._region.getName():
            print childRegion.getName(), " is using the bucket for ", self._usersBucket._region.getName()

        for bucketKey in usersDataBuckets:
            usersBucket = usersDataBuckets[bucketKey]

            # We calculate the maximum likelihood for the Backstorm model
            # using the center of this childRegion as the center of the model
            expertModelValue, nonExpertModelValue = 0, 0

            if "expert" in usersBucket:
                expertUsersBucket = usersBucket["expert"]
                expertUsersConfidenceSum = expertUsersBucket["confidenceSum"]
                expertUserDistance = expertUsersBucket["averageDistance"]
                # expertUserCount = expertUsersBucket['usersCount']
                # try:
                if expertUserDistance <= 1:
                    expertUserDistance = 1.01
                expertModelValue = expertUsersConfidenceSum * log(C * pow(expertUserDistance, -alpha))
                # except:
                # print 'Expert: distance->', expertUserDistance, ' confidence->' , expertUsersConfidenceSum
                # print 'usersBucketKey', bucketKey
                # raise

            if "nonexpert" in usersBucket:
                nonExpertUsersBucket = usersBucket["nonexpert"]
                nonExpertUsersConfidenceSum = nonExpertUsersBucket["confidenceSum"]
                nonExpertUserDistance = nonExpertUsersBucket["averageDistance"]
                # nonExpertUserCount = nonExpertUsersBucket['usersCount']
                # try:
                if nonExpertUserDistance <= 1:
                    nonExpertUserDistance = 1.01
                nonExpertModelValue = 1 - C * pow(nonExpertUserDistance, -alpha)
                nonExpertModelValue = nonExpertUsersConfidenceSum * log(nonExpertModelValue)
            # except:
            # print 'Non Expert: distance->', nonExpertUserDistance, ' confidence->' , nonExpertUsersConfidenceSum
            # print 'usersBucketKey', bucketKey
            # raise
            # print 'Expert Value:', expertModelValue
            expertValueSum += expertModelValue
            nonExpertValueSum += nonExpertModelValue
        # print 'C = ', C, 'alpha = ', alpha
        # print 'Expert likelihood component:' , expertValueSum
        # print 'non Expert likelihood component:' , nonExpertValueSum
        functionValue = expertValueSum + nonExpertValueSum
        # print functionValue,' is the total likelihood value'
        return functionValue

    """
    Calculates the alpha for maximum value of likelihood function for 
    childRegion for a fixed value of C
    @param C: The value of parameter C
    @param childRegion: The childRegion for which we want to calculate the model
    """

    def _goldenSectionSearchForAlpha(self, C, childRegion):
        # Golden section search for alpha parameter for a fixed C
        goldenRatio = 0.618
        aAlpha = Settings.alphaMin
        bAlpha = Settings.alphaMax

        while bAlpha - aAlpha > Settings.alphaTolerance:
            x1Alpha = aAlpha + (1 - goldenRatio) * (bAlpha - aAlpha)
            x2Alpha = aAlpha + goldenRatio * (bAlpha - aAlpha)

            fx1Alpha = self._likelihoodFunction(C, x1Alpha, childRegion)
            fx2Alpha = self._likelihoodFunction(C, x2Alpha, childRegion)

            if fx1Alpha >= fx2Alpha:
                bAlpha = x2Alpha
            else:
                aAlpha = x1Alpha

        argmaxAlpha = (aAlpha + bAlpha) / 2
        maxLikelihoodValue = self._likelihoodFunction(C, argmaxAlpha, childRegion)
        return (argmaxAlpha, maxLikelihoodValue)

    """
    Calculates the alpha for maximum value of likelihood function for 
    childRegion and returns the tuple (C, alpha, maxLikelihoodValue)
    @param C: The value of parameter C
    @param childRegion: The childRegion for which we want to calculate the model
    """

    def _goldenSectionSearch(self, childRegion):
        # Golden section search for C and alpha parameters
        goldenRatio = 0.618
        aC = Settings.Cmin
        bC = Settings.Cmax

        while bC - aC > Settings.Ctolerance:
            x1C = aC + (1 - goldenRatio) * (bC - aC)
            x2C = aC + goldenRatio * (bC - aC)

            fx1C = self._goldenSectionSearchForAlpha(x1C, childRegion)[1]
            fx2C = self._goldenSectionSearchForAlpha(x2C, childRegion)[1]

            if fx1C >= fx2C:
                bC = x2C
            else:
                aC = x1C

        argmaxC = (aC + bC) / 2
        (argmaxAlpha, maxLikelihoodValue) = self._goldenSectionSearchForAlpha(argmaxC, childRegion)
        return (argmaxC, argmaxAlpha, maxLikelihoodValue)

    """
    Just a wrapper method which hides how exactly the maximum likehood value is 
    calculated.
    """

    def _calculateMaxLogLikelihood(self, region):
        self._usersBucket = BucketUsers(region, Settings.bucketingInterval)
        # print 'prepared bucket for ', region.getName()
        # self._usersBucket.printBucket()
        value = self._goldenSectionSearch(region)
        return value

    """
    Finds the childRegion of the given parentRegion for which it 
    is maximum likely that the center of the Backstorm model will 
    lie in the center of this childRegion. This method calculates the 
    likelihood values for  all the children region then return the one 
    with maximum likelihood value.
    
    @param parentRegion: The parent region.
    @return : A tuple containing the mleRegion, argmaxC, argmaxAlpha in that order.
    mleRegion : The maximum likely region.
    argmaxC : The value of C which maximizes the likelihood function for the given 
    mleRegion.
    argmaxAlpha: The value of alpha which maximizes the likelihood function for the given 
    mleRegion.
    """

    def _getMaximumLikelyChildRegion(self, parentRegion):
        # print 'calculating maximum likelihood for all regions in'+ parentRegion.getName() +' and getting the maximum parentRegion'
        previousMaxLikelihoodValue = maxLikelihoodValue = -9999999999999
        (argmaxC, argmaxAlpha) = (0, 0)
        mleRegion = None
        for childrenRegionRow in parentRegion.getChildRegions():
            for childRegion in childrenRegionRow:
                (argmaxC, argmaxAlpha, value) = self._calculateMaxLogLikelihood(childRegion)
                # print 'Got value =' , value
                maxLikelihoodValue = max(maxLikelihoodValue, value)
                # print 'Got mle Value = ', maxLikelihoodValue
                if maxLikelihoodValue > previousMaxLikelihoodValue:
                    # print maxLikelihoodValue, ' is greater than ', previousMaxLikelihoodValue
                    mleRegion = childRegion
                    previousMaxLikelihoodValue = maxLikelihoodValue
        return (mleRegion, argmaxC, argmaxAlpha)

    """
    Calculates the Backstorm for a region and puts the calculted result into the queue.
    @param expertiseRegion: This is the region for which the backstorm model is calculated.
    @param queue: The queue in which the calculated model's data is pushed.
    """

    def _computeModel(self, expertiseRegion, queue):
        region = expertiseRegion
        childRegionVerticalSize = Settings.initialChildRegionVerticalSize
        childRegionHorizontalSize = Settings.initialChildRegionHorizontalSize
        mleRegion = None
        print "Computing model for", region.getName()
        while childRegionHorizontalSize >= Settings.minChildRegionSize:
            region.segmentByChildSize(childRegionHorizontalSize, childRegionVerticalSize)
            mleRegion, argmaxC, argmaxAlpha = self._getMaximumLikelyChildRegion(region)
            # print mleRegion.getName(), ' has the largest mle value', 'C = ', argmaxC, ' alpha = ', argmaxAlpha
            region = mleRegion
            childRegionHorizontalSize = childRegionHorizontalSize / 2
            childRegionVerticalSize = childRegionVerticalSize / 2

        # Putting the resultant computed model into the queue
        dictResult = {
            "C": argmaxC,
            "alpha": argmaxAlpha,
            "center": mleRegion.getCenter(),
            "regionName": expertiseRegion.getName(),
        }
        # print 'The resultant model is :', dictResult
        queue.put(dictResult)

    """
    Spawn a subprocess for every region in the provided 'expertiseRegions' list
    and performs parallel computation then puts the results in to the provided 
    dataQueue asynchronously.
    @param expertiseRegions: A list of regions for which we want to compute the 
    backstorm model
    
    @param dataQueue: A queue which used by all the subprocess to store their results
    """

    def _computeModelsParallely(self, expertiseRegions, dataQueue):
        processes = []
        for expertiseRegion in expertiseRegions:
            processName = expertiseRegion.getName() + "process"
            process = Process(target=self._computeModel, args=(expertiseRegion, dataQueue), name=processName)
            processes.append(process)

        for process in processes:
            process.start()
        return processes

    """
    The method reads the dataQueue and populates the '_dictExpertModels' dictionary
    with the results present in the dataQueue.
    @param expertise: The expertise for which the model data is residing in the dataQueue.
    @param dataQueue: The queue in which the model data is present
    """

    def _populateResultsFromQueue(self, expertise, dataQueue):
        # All the children processes have finished their computation now
        # lets populate the dictionary with the results that they have
        # pushed into the queue.
        while not dataQueue.empty():
            modelDict = dataQueue.get()
            if expertise in self._dictExpertModels:
                # print 'Added Entry to dictionary for ', modelDict['regionName']
                self._dictExpertModels[expertise].append(modelDict)
            else:
                # print 'Appended entry for', modelDict['regionName']
                self._dictExpertModels[expertise] = [modelDict]

    """
    Checks if all processes have finished computation if yes then returns 
    otherwise sleeps.
    """

    def _waitForProcesses(self, processes):
        #  Here the current process sleeps while the subprocesses
        #  are busy in computation
        while True:
            areAlive = False
            for process in processes:
                areAlive = areAlive or process.is_alive()
            if areAlive == False:
                break
            time.sleep(2)

    """
    Simply a wrapper over _calculateMaxLogLikelihood to support
    multiprocessing.
    """

    def _computeLogLikeliHood(self, expertiseRegion, dataQueue):
        (argmaxC, argmaxAlpha, maxLikelihoodValue) = self._calculateMaxLogLikelihood(expertiseRegion)
        # Putting the resultant computed model into the queue
        dictResult = {
            "C": argmaxC,
            "alpha": argmaxAlpha,
            "center": expertiseRegion.getCenter(),
            "regionName": expertiseRegion.getName(),
        }
        dataQueue.put(dictResult)

    """
    Spawn a subprocess for every region in the provided 'expertiseRegions' list
    and performs parallel computation then puts the results in to the provided 
    dataQueue asynchronously.
    @param expertiseRegions: A list of regions for which we want to compute the 
    backstorm model
    
    @param dataQueue: A queue which used by all the subprocess to store their results
    """

    def _computeLogLikelihoodValuesParallely(self, expertiseRegions, dataQueue):
        processes = []

        for expertiseRegion in expertiseRegions:
            processName = expertiseRegion.getName() + "process"
            process = Process(target=self._computeLogLikeliHood, args=(expertiseRegion, dataQueue), name=processName)
            processes.append(process)

        for process in processes:
            process.start()

        self._waitForProcesses(processes)
        return processes

    def _optimizeParameters(self, expertise):
        expertiseRegions = self._dictExpertiseRegions[expertise]
        dataQueue = Queue(len(expertiseRegions))
        self._computeLogLikelihoodValuesParallely(expertiseRegions, dataQueue)
        self._populateResultsFromQueue(expertise, dataQueue)

    def _updateExpertiseRegions(self, expertise):
        dictRegionPartition = {}
        usersData = UsersData.getUsersData()
        for userData in usersData:
            regionName = userData[5]
            if regionName in dictRegionPartition:
                dictRegionPartition[regionName].append(userData)
            else:
                dictRegionPartition[regionName] = [userData]

        print "Regions: ", dictRegionPartition.keys()
        dictCenters = {}
        for model in self._dictExpertModels[expertise]:
            dictCenters[model["regionName"]] = model["center"]

        expertRegions = []
        for regionName in dictRegionPartition:
            usersData = dictRegionPartition[regionName]
            print regionName, " has ", len(usersData), " users assigned to it out of", len(
                UsersData.getUsersData()
            ), " users"
            leftTop, rightBottom = self._getBoundingBox(usersData)
            try:
                expertRegion = Region(
                    leftTop,
                    rightBottom,
                    center=dictCenters[regionName],
                    name=regionName,
                    isParent=True,
                    expertise=expertise,
                )

                expertRegions.append(expertRegion)
            except:
                print "Region invalid.. discarded!!"
        self._dictExpertiseRegions[expertise] = expertRegions

    def _displayRegionsInfo(self, expertise):
        expertiseRegions = self._dictExpertiseRegions[expertise]
        models = self._dictExpertModels[expertise]
        # map_plots.plotRegion(expertiseRegions, models)

    """
    Generates models for given expertises regions
    @param expertise: The expertise for which we want to generate Backstorm models.
    """

    def generateModelForExpertise(self, expertise):
        print "Generating model for", expertise
        # extracting the regions for the given expertise
        self._optimizeParameters(expertise)
        expertiseRegions = self._dictExpertiseRegions[expertise]
        dataQueue = Queue(len(expertiseRegions))
        print "Initially computed models:-"
        pprint(self._dictExpertModels)
        iternum = 0
        while iternum < Settings.numberOfIterations:
            UsersData.partitionUsers(self._dictExpertModels[expertise], expertise)
            print "Recomputed Bounding boxes for regions-----"
            self._updateExpertiseRegions(expertise)
            self._displayRegionsInfo(expertise)
            self._dictExpertModels.clear()
            expertiseRegions = self._dictExpertiseRegions[expertise]
            print "---- Recomputing the centers------------"
            start = time.time()
            processes = self._computeModelsParallely(expertiseRegions, dataQueue)
            self._waitForProcesses(processes)
            self._populateResultsFromQueue(expertise, dataQueue)
            print "It took ", start - time.time(), " seconds"
            print "---------------Generated multiple center model----------"
            pprint(self._dictExpertModels)
            iternum += 1

    def generateSingleCenterModel(self, expertise):
        expertiseRegion = self._dictExpertiseRegions[expertise][0]
        dataQueue = Queue(1)
        self._computeModel(expertiseRegion, dataQueue)
        self._populateResultsFromQueue(expertise, dataQueue)
        self._dictExpertModels[expertise] = dataQueue.get()
        pprint(self._dictExpertModels)

    """
    Generates models for all expertise extracted from the users data
    """

    def generateModelForAllExpertise(self):
        print "Generating model for all expertise"
        for expertise in self._dictExpertiseRegions:
            self.generateModelForExpertise(expertise)

    """
    Creates bounding regions for all the expertise extracted from the users data
    """

    def _createParentRegions(self):
        print "creating parent Regions"
        for expertise in self._dictExpertUsersData:
            self._initializeExpertRegions(expertise)

    """
    Generates a random center for a given bounding region's lefttop and 
    right bottom coordinates
    
    @param leftTop: The coordinates tuple for the left top corner 
    of the region (latitude, longitude)
    @param rightBottom: The coordinates tuple for the right bottom 
    corner of the region (latitude, longitude)
    """

    def _getRandomCenter(self, leftTop, rightBottom):
        randomLattitude = randint(int(rightBottom[0] * 1000), int(leftTop[0] * 1000)) / 1000.0
        randomLongitude = randint(int(leftTop[1] * 1000), int(rightBottom[1] * 1000)) / 1000.0
        randomCenter = (randomLattitude, randomLongitude)
        return randomCenter

    """
    Generates the random centers such that they are uniformly distributed over the 
    targeted geographical region
    """

    def _getRandomCenters(self, leftTop, rightBottom):
        dummyRegion = Region(leftTop, rightBottom)
        childRegionCount = int(round(pow(Settings.numberOfcenters, 0.5)))
        dummyRegion.segmentByChildCount(childRegionCount, childRegionCount)
        randomCenters = []
        for childRegionRow in dummyRegion.getChildRegions():
            for childRegion in childRegionRow:
                randomCenter = self._getRandomCenter(childRegion.getLeftTop(), childRegion.getRightBottom())
                randomCenters.append(randomCenter)
        return randomCenters

    """
    Creates a bounding region for a certain expertise based on the
    location information in user data.
    @param expertise: The expertise for which we want to create a bounding
    region.
    @return: The bounding region corresponding to the expertise.
    """

    def _getBoundingBox(self, expertUsersData):
        maxLatitude = max(expertUsersData, key=itemgetter(2))[2]
        minLatiude = min(expertUsersData, key=itemgetter(2))[2]
        maxLongitude = max(expertUsersData, key=itemgetter(3))[3]
        minLongitude = min(expertUsersData, key=itemgetter(3))[3]
        leftTop = maxLatitude, minLongitude
        rightBottom = minLatiude, maxLongitude
        return leftTop, rightBottom

    def _initializeExpertRegions(self, expertise):

        expertUsersData = self._dictExpertUsersData[expertise]
        if len(expertUsersData) == 0:
            return
        leftTop, rightBottom = self._getBoundingBox(expertUsersData)
        expertRegions = []
        centers = self._getRandomCenters(leftTop, rightBottom)
        for index in range(len(centers)):
            center = centers[index]
            regionName = str(expertise) + " Region " + str(index)
            expertRegion = Region(
                leftTop, rightBottom, center=center, name=regionName, isParent=True, expertise=expertise
            )
            expertRegions.append(expertRegion)

        # print 'Coordinates : minlatitude = ', minLatiude, ', maxLatitude = ', maxLatitude
        # print 'minlongitude = ', minLongitude, ', maxlongitude = ', maxLongitude
        self._dictExpertiseRegions[expertise] = expertRegions

    """
    Gets the expertise for a bounding region
    @param region: The expertise corresponding to the provided bounding Region.
    """

    def _getExpertiseForRegion(self, region):
        return region.getName().split(" ")[0]

    def getModelsForExpertise(self, expertise):
        return self._dictExpertModels[expertise]

    def getModelsForAllExpertise(self):
        return self._dictExpertModels

    """
    Methods for Debugging and testing purposes.
    """

    def testGoldenSectionSearch(self, regionCenter, expertise):
        expertRegion = Region(
            (45, -129), (25, -50), center=regionCenter, name="Test Region", isParent=True, expertise=expertise
        )
        self._usersBucket = BucketUsers(expertRegion, Settings.bucketingInterval)
        self._usersBucket.printBuckets()
        print self._goldenSectionSearch(expertRegion)

    def testLikelihood(self, regionCenter, expertise, C, alpha):
        expertRegion = Region(
            (45, -129), (25, -50), center=regionCenter, name="Test Region", isParent=True, expertise=expertise
        )
        self._usersBucket = BucketUsers(expertRegion, Settings.bucketingInterval)
        print "likelihood value:", self._likelihoodFunction(C, alpha, expertRegion)

    def display(self):
        print "Models Created as follows"
        pprint(self._dictExpertModels)
示例#4
0
 def _calculateMaxLogLikelihood(self, region):
     self._usersBucket = BucketUsers(region, Settings.bucketingInterval)
     # print 'prepared bucket for ', region.getName()
     # self._usersBucket.printBucket()
     value = self._goldenSectionSearch(region)
     return value