def computeBiggestCoordinatesFromResults(self, resCandidates):
        ''' Reduce a set of candidates to the one located at the centre
            returned by computeBiggestGroupOfCoordinates.

            resCandidates: container exposing getList(); every item exposes
                           getCoordStr() and getCoord().

            Returns a new ResCandidates holding the first candidate whose
            coordinates equal the computed centre. When the centre is empty
            or no candidate matches it, the returned container holds a single
            fallback entry (score 0.0, self.defaultCoordinates, "no results").
        '''
        # e.g. results look like [ [5.0, [lat,lon], "tags"] ]
        # the grouping routine works on the string form of the coordinates
        coordStrings = [cand.getCoordStr() for cand in resCandidates.getList()]

        if self.verbose:
            print >> sys.stderr, ""
            print >> sys.stderr, "computeBiggestCoordinatesFromResults] coordList:", coordStrings

        # only the centre is used here, the second returned value is ignored
        centerCoo, _ = self.computeBiggestGroupOfCoordinates(coordStrings)

        selected = ResCandidates()
        if len(centerCoo) > 0:
            # recover the full candidate (score, tags, ...) behind the centre
            for cand in resCandidates.getList():
                if cand.getCoord() == centerCoo:
                    selected.addObj(cand)
                    return selected

        # no centre, or no candidate sits on it: fall back to the default
        selected.add(0.0, self.defaultCoordinates, "no results")
        return selected
 def getContentVideoResults(self, filename):
     ''' Build the content-matching candidate for a file.

         filename: key looked up in self.FnRes.

         Returns a ResCandidates that is empty when self.FnRes has no entry
         for filename; otherwise it holds one entry made of the two values
         returned by identifyCloserCoordinatesInTuple (the second one as
         score, the first one as coordinates) tagged "_byContentMatching".
     '''
     contentResults = ResCandidates()
     if not self.FnRes.has_key(filename):
         return contentResults
     # choose the best coordinates among the ones stored for this file
     bestCoo, distSum = self.identifyCloserCoordinatesInTuple(
         self.FnRes[filename])
     contentResults.add(distSum, bestCoo, "_byContentMatching")
     return contentResults
    def getUserMostProbableLocation(self, uId):
        ''' Return the most frequent known location of the given user.

            uId: user identifier, used as key into self.UserPlaces.

            Every entry of self.UserPlaces[uId] is read positionally:
            index 0 = latitude, index 1 = longitude, index 3 = frequency
            (index 2 is not used here).

            Returns a ResCandidates that is empty when the user is unknown or
            when the selected coordinates equal self.defaultCoordinates;
            otherwise it holds one entry with score -1, the [lat, lon] of the
            entry with the highest frequency and the tag uId + "_userPlaces".
        '''
        resCandidates = ResCandidates()
        if uId not in self.UserPlaces:
            return resCandidates
        # look for the coordinates with the highest frequency
        # NOTE(review): assumes the frequency field is numeric - confirm
        # against the code that fills self.UserPlaces
        maxFreq = 0
        bestCoord = self.defaultCoordinates
        for coordFreq in self.UserPlaces[uId]:
            freq = coordFreq[3]
            # ">=" keeps the later entry on ties; the running maximum has to
            # be updated, otherwise every entry would replace the previous one
            if freq >= maxFreq:
                maxFreq = freq
                bestCoord = [coordFreq[0], coordFreq[1]]
        if bestCoord != self.defaultCoordinates:
            resCandidates.add(-1, bestCoord, uId + "_userPlaces")
        return resCandidates
    def multithreadTestVideosParser(self, line, varLock=""):
        ''' Parse one test-video line, choose its coordinates and serialize
            the outcome.

            line: tab-separated record; field 3 = tags, field 4 = machine
                  tags, field 5 = real location ("lat|lon|..."), field 6 =
                  owner info ("userId|hometown").
            varLock: not used by this method (kept for the callers).

            The methods are tried in order until one yields candidates:
              1. tags matched against the training set,
              2. most probable location of the owner,
              3. owner hometown used as tags,
              4. self.defaultCoordinates.
            With more than two candidates the list is reduced through
            computeBiggestCoordinatesFromResults. The result is handed to
            self.serialize together with the real coordinates and the name
            of the method used. Nothing is returned.
        '''
        methodUsed = "All Methods"
        blk = line.strip().split("\t")
        # get the tags
        tags = clean_line(blk[3])
        mtags = clean_line(blk[4])
        print >> sys.stderr, "---"
        print >> sys.stderr, "* input tags: %s [mtags: %s]" % (tags, mtags)
        # Read Real Location: "45.516639|-122.681053|16|Portland|Oregon|etats-Unis|16"
        geo = blk[5].strip().split("|")
        realCoord = [float(geo[0]), float(geo[1])]
        resCandidates = ResCandidates()

        # 1st METHOD: check tags in training set
        if len(tags) > 1:
            methodUsed = "TrainSet"
            resCandidates = self.matchWithGroupsOfTags(tags, mtags)
        else:
            self.updateLockVariables(methodUsed)
        # print out info
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(
            ), methodUsed

        # 2nd METHOD: (if there are NOT RESULTS) use the most probable location for this user
        if resCandidates.size() == 0:
            methodUsed = "UserCommonLocation"
            ownId = blk[6].split("|")[0]
            resCandidates = self.getUserMostProbableLocation(ownId)
            if resCandidates.size() > 0:
                self.updateLockVariables(methodUsed)
            # print out info
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(
                ), methodUsed

        # 3rd METHOD: (if there are NOT RESULTS) USE the USER HOMETOWN as tags
        if resCandidates.size() == 0:
            methodUsed = "HomeTown"
            # Owner location: ['14678786@N00', 'milwaukee, United States']
            ownGeo = blk[6].split("|")
            # the hometown part may be missing from the owner field
            ht = ownGeo[1].split(",") if len(ownGeo) > 1 else []
            # keep only the pieces longer than one character
            hometownTags = " ".join([w for w in ht if len(w) > 1])
            # the string must be tested for content: comparing it with an
            # integer is always true in Python 2
            if hometownTags:
                # check with the new set of tags
                resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
                if resCandidates.size() > 0:
                    self.updateLockVariables(methodUsed)
            # print out info
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(
                ), methodUsed

        # 4th METHOD: (if there are NOT RESULTS) define zero position
        if resCandidates.size() == 0:
            methodUsed = "DefaultCoordinates"
            self.updateLockVariables(methodUsed)
            resCandidates.add(-1.0, self.defaultCoordinates, "no results")
            # print out info
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

        # return the bigger coordinates group
        if resCandidates.size() > 2:
            # always bound, so the second verbose section can read it
            candies = []
            # print out info
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size(
                )
                for res in resCandidates.getList():
                    candies.append(
                        str(res.getScore()) + "," + str(res.getCoord()[0]) +
                        "|" + str(res.getCoord()[1]))
            resCandidates = self.computeBiggestCoordinatesFromResults(
                resCandidates)
            # print out info
            if self.verbose:
                for can in candies:
                    print >> sys.stderr, can
                print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList(
                )[0].getCoord()

        # write the buffer into a file
        self.serialize(resCandidates, realCoord, methodUsed)
        print >> sys.stderr, "---\n"
    def parseTestVideos(self, withMtags=True, officialRun=False):
        ''' Read every line of self.TestFile, choose the coordinates of each
            video and serialize them.

            withMtags: forwarded to getInfoFromTestVideo.
            officialRun: when True the real coordinates are not read, only
                         serializeFormatted is called and the final
                         statistics are not printed.

            Line layout:
            videoId <t> title <t> url <t> tags <t> mtags <t> location <t> ownerLocation

            The methods are tried in order until one yields candidates:
              1. tags matched against the training set,
              2. most probable location of the owner,
              3. owner hometown used as tags,
              4. content-based results for the file name,
              5. self.defaultCoordinates.
            With more than two candidates the list is reduced through
            computeBiggestCoordinatesFromResults. self.TestFile is closed at
            the end. Nothing is returned.
        '''
        print >> sys.stderr, "*> [SelectCoordMethod: %s] [ScoreMetric: %s] [MatchingType: %s]" % (
            self.selectCoordMethod, self.scoreMetric, self.matchingType)
        print >> sys.stderr, "*> [TestSet]",
        t1 = time.time()
        userMostProbableLocation = 0
        userHomeCoordinates = 0
        nullCoordinates = 0
        contentbased = 0
        noTags = 0
        # hard-coded total, only used for the progress percentage
        totLines = 4532.0
        lines = 0
        for line in self.TestFile:
            lines += 1
            print >> sys.stderr, "---"
            methodUsed = "All Methods"
            ################
            # get filename and url
            blk = line.strip().split("\t")
            url = blk[2].strip()
            # str.replace returns a new string, so the result must be kept
            filename = blk[1].split("|")[0].replace('.jpg', '')
            ################
            # get the tags, mtags, geo and ownGeo
            tags, mtags, geo, ownGeo = self.getInfoFromTestVideo(
                line, withMtags)
            realCoord = [float(geo[0]), float(geo[1])
                         ] if not officialRun else ""
            resCandidates = ResCandidates()

            # 1st METHOD: check tags in training set
            if len(tags) > 1:
                methodUsed = "TrainSet"
                resCandidates = self.matchWithGroupsOfTags(tags, mtags)
            else:
                noTags += 1
            # print out info
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(
                ), methodUsed

            # 2nd METHOD: (if there are NOT RESULTS) use the most probable location for this user
            if resCandidates.size() == 0:
                methodUsed = "UserCommonLocation"
                ownId = ownGeo[0]
                resCandidates = self.getUserMostProbableLocation(ownId.lower())
                if resCandidates.size() > 0:
                    print >> sys.stderr, "*> Used UserCommonLocation for user %s" % ownId
                    userMostProbableLocation += 1
                else:
                    print >> sys.stderr, "*> NO UserCommonLocation for user %s" % ownId
                # print out info
                if self.verbose:
                    print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(
                    ), methodUsed

            # 3rd METHOD: (if there are NOT RESULTS) USE the USER HOMETOWN as tags
            if resCandidates.size() == 0:
                methodUsed = "HomeTown"
                # the hometown part may be missing from the owner info
                ht = ownGeo[1].split(",") if len(ownGeo) > 1 else []
                # keep only the pieces longer than one character
                hometownTags = " ".join([w for w in ht if len(w) > 1])
                # Without hometown tags this method is simply skipped: the
                # video still has to reach the next methods and be
                # serialized, so no "continue" here.
                if hometownTags:
                    # check with the new set of tags
                    resCandidates = self.matchWithGroupsOfTags(
                        hometownTags, "")
                    if resCandidates.size() > 0:
                        userHomeCoordinates += 1
                # print out info
                if self.verbose:
                    print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(
                    ), methodUsed

            # 4th METHOD: Check Content-Video results
            # NOTE(review): methodUsed is not updated on this path, so a
            # content-based result is serialized under the previous method
            # name - confirm whether serialize() expects a dedicated name
            if resCandidates.size() == 0:
                resCandidates = self.getContentVideoResults(filename)
                if resCandidates.size() > 0:
                    print >> sys.stderr, "*> Used Content-Based Approach"
                    contentbased += 1
                else:
                    print >> sys.stderr, "*> NO Content-Based Information"

            # 5th METHOD: (if there are NOT RESULTS) define zero position
            if resCandidates.size() == 0:
                methodUsed = "DefaultCoordinates"
                nullCoordinates += 1
                resCandidates.add(-1.0, self.defaultCoordinates, "no results")
                # print out info
                if self.verbose:
                    print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

            # return the bigger coordinates group
            if resCandidates.size() > 2:
                # always bound, so the second verbose section can read it
                candies = []
                # print out info
                if self.verbose:
                    print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size(
                    )
                    for res in resCandidates.getList():
                        candies.append(
                            str(res.getScore()) + "," +
                            str(res.getCoord()[0]) + "|" +
                            str(res.getCoord()[1]))
                resCandidates = self.computeBiggestCoordinatesFromResults(
                    resCandidates)
                # print out info
                if self.verbose:
                    for can in candies:
                        print >> sys.stderr, can
                    print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList(
                    )[0].getCoord()

            # write the buffer into a file
            if not officialRun:
                self.serialize(resCandidates, realCoord, methodUsed)
            self.serializeFormatted(resCandidates, filename, url)
            # print info
            if (lines % 100) == 0:
                print >> sys.stderr, "\r*> [TestSet] %2.2f%s parsed [%d usrLoc, %d usrHome, %d content-based, %d defCoordinates, %d noTags]" % (
                    (lines / totLines * 100), '%', userMostProbableLocation,
                    userHomeCoordinates, contentbased, nullCoordinates,
                    noTags),
        # final info
        t2 = time.time()
        print >> sys.stderr, "\r*> [TestSet]  %d videos [%d userLocation, %d hometown, %d content-based, %d defaultCoordinates, %d with no tags] ~%.2fs" % (
            lines, userMostProbableLocation, userHomeCoordinates, contentbased,
            nullCoordinates, noTags, (t2 - t1) * 1.0)
        # compute the average of the statistics
        print >> sys.stderr, "---"
        if not officialRun:
            for k in self.limits:
                # an empty test file would otherwise divide by zero
                p = float(self.stats[k]) / lines * 100 if lines else 0.0
                print >> sys.stderr, "*> [TestSet] %d videos (%.2f%s) inside a radius of %dkm" % (
                    self.stats[k], p, '%', k)
        # close test file
        self.TestFile.close()
    def matchWithGroupsOfTags(self, tagsLine, mtagsLine):
        ''' Compare the test-video tags and machine tags with the TrainSet
            and GeoNames indexes and return the best candidates.

            tagsLine:  space-separated tags.
            mtagsLine: space-separated machine tags (may be empty).

            Steps:
              1. Collect the groups of tags indexed under the machine tags;
                 when none is found, collect the ones indexed under any tag
                 (the machine tags, if present, are merged into the tags).
              2. For every collected group count the matching tags, using
                 substring matching when self.matchingType is 'partial' and
                 exact word matching when it is 'perfect'.
              3. Score the groups, keep the top self.topn scores and resolve
                 each kept group to coordinates.

            Returns a ResCandidates, empty when no group matches.
        '''
        # create output with all the top Result Candidates
        resCandidates = ResCandidates()
        matches = MatchCandidates()
        tags = tagsLine.strip().split(" ")
        # both indexes are queried the same way, TrainSet first
        indexes = (self.TrainTagIndex, self.GeoNamesIndex)

        ########################################################
        # Get the GoT where the index matches with mtag or tag #
        ########################################################
        # If there are Machine Tags, retrieve the GroupOfTags just from them
        if len(mtagsLine.strip()) > 2:
            mtags = mtagsLine.strip().split(" ")
            tags = tags + mtags  # Merge the mtags with the tags
            for mtag in mtags:
                for index in indexes:
                    if mtag in index:
                        for groupOfTags in index[mtag]:
                            matches.update(groupOfTags, 0)
        # If the mtags didn't find any match, use the tags
        if matches.size() == 0:
            for tag in tags:
                for index in indexes:
                    if tag in index:
                        for groupOfTags in index[tag]:
                            matches.update(groupOfTags, 0)

        # if there are no matches, return the empty result
        if matches.size() == 0:
            return resCandidates

        #########################################################################
        # Count the number of matches between the tags+mtags and the candidates #
        #########################################################################
        for tag in tags:
            for groupOfTags in matches.getKeys():
                # the matching type is an attribute of the instance
                if self.matchingType == 'partial':
                    # partial match: the tag appears anywhere in the group
                    if tag in groupOfTags:
                        matches.update(groupOfTags, 1)
                elif self.matchingType == 'perfect':
                    # perfect match: the tag equals a whole word of the
                    # group; every equal word counts (no break on purpose)
                    for got in groupOfTags.strip().split(" "):
                        if tag == got:
                            matches.update(groupOfTags, 1)

        # computing the score for each groups of tags
        matches.computeScores(self.scoreMetric, len(tags))
        # get the scores of the topN matches
        maxScores = matches.getTopNScores(self.topn)
        # get the keys of the topN matches
        topnKeys = matches.getKeysWithGivenScores(maxScores)
        if self.verbose:
            print >> sys.stderr, "matchWithGroupOfTags] we are here, maxScores: %d, topNelements: %d" % (
                maxScores[0], len(topnKeys))

        ########################################################################
        # From all the topN candidates compute the most likelihood coordinates #
        ########################################################################
        for got in topnKeys:
            # the key comes either from the TrainTags or from GeoNames
            if got in self.TrainTagCoord:
                coord = self.getMostLikelihoodCoordinates(
                    self.TrainTagCoord[got])
            elif got in self.GeoNames:
                coord = self.getMostLikelihoodCoordinates(self.GeoNames[got])
            else:
                continue
            # skip the groups without coordinates
            if len(coord) == 0:
                continue
            # update the results
            resCandidates.add(matches.getValue(got), coord, got)
        # return all the results
        return resCandidates