def computeBiggestCoordinatesFromResults(self, resCandidates):
    '''
    Reduce a set of result candidates to the one sitting at the centre of
    the biggest group of coordinates.

    The string coordinates of every candidate are handed to
    computeBiggestGroupOfCoordinates. The first candidate whose coordinates
    equal the returned centre is returned, wrapped in a fresh ResCandidates.
    If no centre is produced, or no candidate matches it, the returned
    container holds a single "no results" entry at self.defaultCoordinates.
    '''
    # candidates look like e.g. [ [5.0, [lat,lon], "tags"] ]
    selected = ResCandidates()

    # coordinates in string format, as computeBiggestGroupOfCoordinates expects
    coordList = [candidate.getCoordStr()
                 for candidate in resCandidates.getList()]

    # Show the input coordinates list
    if self.verbose:
        print >> sys.stderr, ""
        print >> sys.stderr, "computeBiggestCoordinatesFromResults] coordList:", coordList

    # Centre of the biggest group (the summed distance is not needed here)
    centerCoo, _sumOfD = self.computeBiggestGroupOfCoordinates(coordList)

    if len(centerCoo) > 0:
        # look up the full "result" object that owns the centre coordinates
        owner = next((candidate for candidate in resCandidates.getList()
                      if candidate.getCoord() == centerCoo), None)
        if owner is not None:
            selected.addObj(owner)
            return selected

    # nothing matched: something went wrong, fall back to the default position
    selected.add(0.0, self.defaultCoordinates, "no results")
    return selected
def getContentVideoResults(self, filename):
    '''
    Return the content-based (visual matching) result for a video file.

    filename -- key looked up in self.FnRes.

    Returns a ResCandidates holding one entry (score = the summed distance
    returned by identifyCloserCoordinatesInTuple, tagged
    "_byContentMatching") when the file is known, otherwise an empty
    ResCandidates.
    '''
    resCandidates = ResCandidates()

    # `in` replaces the deprecated dict.has_key()
    if filename not in self.FnRes:
        return resCandidates

    # Select the best coordinates among the ones stored for this file
    listOfCoo = self.FnRes[filename]
    coo, sumOfD = self.identifyCloserCoordinatesInTuple(listOfCoo)

    # Add the best coordinates to the result
    resCandidates.add(sumOfD, coo, "_byContentMatching")
    return resCandidates
def getUserMostProbableLocation(self, uId):
    '''
    Return the most probable location of the given user.

    uId -- user identifier, looked up in self.UserPlaces.

    Each entry of self.UserPlaces[uId] is read as
    [lat, lon, accuracy, frequency]; the coordinates with the highest
    frequency win (on ties the later entry wins).

    Returns a ResCandidates with a single entry (score -1, tagged
    "<uId>_userPlaces"), or an empty ResCandidates when the user is unknown
    or the winning coordinates equal self.defaultCoordinates.
    '''
    resCandidates = ResCandidates()
    if uId not in self.UserPlaces:
        return resCandidates

    # Look for the coordinates with the highest frequency.
    # NOTE(review): assumes the frequency field is numeric -- confirm
    # against the code that fills self.UserPlaces.
    maxFreq = 0
    bestCoord = self.defaultCoordinates
    for coordFreq in self.UserPlaces[uId]:
        lat = coordFreq[0]
        lon = coordFreq[1]
        freq = coordFreq[3]
        if freq >= maxFreq:
            # remember the new maximum; without this update every entry
            # would pass the test and the last one would always win
            maxFreq = freq
            bestCoord = [lat, lon]

    if bestCoord != self.defaultCoordinates:
        resCandidates.add(-1, bestCoord, uId + "_userPlaces")
    return resCandidates
def multithreadTestVideosParser(self, line, varLock=""):
    '''
    Geo-locate a single test video described by one tab-separated line.

    line    -- video meta-data; the fields used are [3] tags, [4] machine
               tags, [5] real location "lat|lon|acc|..." and [6] owner
               info "ownerId|hometown".
    varLock -- not used by this method; kept for interface compatibility.

    Each step below is tried only while no candidate has been found:
      1. match the tags against the training set (matchWithGroupsOfTags)
      2. most probable location of the owner (getUserMostProbableLocation)
      3. owner hometown used as tags
      4. self.defaultCoordinates
    With more than two candidates the set is reduced through
    computeBiggestCoordinatesFromResults. The outcome is handed to
    self.serialize together with the real coordinates and the method name.
    '''
    methodUsed = "All Methods"
    blk = line.strip().split("\t")

    # get the tags
    tags = clean_line(blk[3])
    mtags = clean_line(blk[4])
    print >> sys.stderr, "---"
    print >> sys.stderr, "* input tags: %s [mtags: %s]" % (tags, mtags)

    # Real location, e.g. "45.516639|-122.681053|16|Portland|Oregon|etats-Unis|16"
    geo = blk[5].strip().split("|")
    realCoord = [float(geo[0]), float(geo[1])]

    # Owner info, e.g. "14678786@N00|milwaukee, United States"
    ownGeo = blk[6].split("|")

    resCandidates = ResCandidates()

    # 1st METHOD: check tags in training set
    if len(tags) > 1:
        methodUsed = "TrainSet"
        resCandidates = self.matchWithGroupsOfTags(tags, mtags)
    else:
        self.updateLockVariables(methodUsed)
    if self.verbose:
        print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 2nd METHOD: (if there are NO RESULTS) most probable location of this user
    if resCandidates.size() == 0:
        methodUsed = "UserCommonLocation"
        resCandidates = self.getUserMostProbableLocation(ownGeo[0])
        if resCandidates.size() > 0:
            self.updateLockVariables(methodUsed)
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 3rd METHOD: (if there are NO RESULTS) use the user hometown as tags
    if resCandidates.size() == 0:
        methodUsed = "HomeTown"
        # the hometown field may be missing altogether
        hometown = ownGeo[1] if len(ownGeo) > 1 else ""
        hometownTags = " ".join(w for w in hometown.split(",") if len(w) > 1)
        # The original test compared the string itself with 1, which is
        # always true in Python 2; only query when there is something to match.
        if hometownTags:
            resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
            if resCandidates.size() > 0:
                self.updateLockVariables(methodUsed)
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 4th METHOD: (if there are NO RESULTS) define zero position
    if resCandidates.size() == 0:
        methodUsed = "DefaultCoordinates"
        self.updateLockVariables(methodUsed)
        resCandidates.add(-1.0, self.defaultCoordinates, "no results")
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

    # keep only the biggest coordinates group
    if resCandidates.size() > 2:
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size()
        # snapshot of the candidates before the reduction, for the log only
        candies = []
        for res in resCandidates.getList():
            coord = res.getCoord()
            candies.append(str(res.getScore()) + "," + str(coord[0]) + "|" + str(coord[1]))
        resCandidates = self.computeBiggestCoordinatesFromResults(resCandidates)
        if self.verbose:
            for can in candies:
                print >> sys.stderr, can
            print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList()[0].getCoord()

    # write the buffer into a file
    self.serialize(resCandidates, realCoord, methodUsed)
    print >> sys.stderr, "---\n"
def parseTestVideos(self, withMtags=True, officialRun=False):
    '''
    Read the TestSet file (self.TestFile), parse the meta-data of each video
    and retrieve the most suitable place for it.

    withMtags   -- forwarded to getInfoFromTestVideo.
    officialRun -- when True the real coordinates are not read, only
                   serializeFormatted is called and the final radius
                   statistics are skipped.

    Each step below is tried only while no candidate has been found:
      1. match the tags against the training set (matchWithGroupsOfTags)
      2. most probable location of the owner (getUserMostProbableLocation)
      3. owner hometown used as tags
      4. content-based results (getContentVideoResults)
      5. self.defaultCoordinates
    With more than two candidates the set is reduced through
    computeBiggestCoordinatesFromResults. Progress and final statistics
    are written to stderr; self.TestFile is closed at the end.
    '''
    print >> sys.stderr, "*> [SelectCoordMethod: %s] [ScoreMetric: %s] [MatchingType: %s]" % (
        self.selectCoordMethod, self.scoreMetric, self.matchingType)
    print >> sys.stderr, "*> [TestSet]",
    t1 = time.time()

    userMostProbableLocation = 0
    userHomeCoordinates = 0
    nullCoordinates = 0
    contentbased = 0
    noTags = 0
    # hard-coded size of the test set, only used for the progress percentage
    totLines = 4532.0
    lines = 0

    # videoId <t> title <t> url <t> tags <t> mtags <t> location <t> ownerLocation
    for line in self.TestFile:
        lines += 1
        print >> sys.stderr, "---"
        methodUsed = "All Methods"

        # get filename and url
        blk = line.strip().split("\t")
        url = blk[2].strip()
        filename = blk[1].split("|")[0]
        # str.replace returns a new string: the result has to be kept
        filename = filename.replace('.jpg', '')

        # get the tags, mtags, geo and ownGeo
        tags, mtags, geo, ownGeo = self.getInfoFromTestVideo(line, withMtags)
        realCoord = [float(geo[0]), float(geo[1])] if not officialRun else ""
        resCandidates = ResCandidates()

        # 1st METHOD: check tags in training set
        if len(tags) > 1:
            methodUsed = "TrainSet"
            resCandidates = self.matchWithGroupsOfTags(tags, mtags)
        else:
            noTags += 1
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 2nd METHOD: (if there are NO RESULTS) most probable location of this user
        if resCandidates.size() == 0:
            methodUsed = "UserCommonLocation"
            ownId = ownGeo[0]
            resCandidates = self.getUserMostProbableLocation(ownId.lower())
            if resCandidates.size() > 0:
                print >> sys.stderr, "*> Used UserCommonLocation for user %s" % ownId
                userMostProbableLocation += 1
            else:
                print >> sys.stderr, "*> NO UserCommonLocation for user %s" % ownId
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 3rd METHOD: (if there are NO RESULTS) use the user hometown as tags
        if resCandidates.size() == 0:
            methodUsed = "HomeTown"
            hometownTags = " ".join(w for w in ownGeo[1].split(",") if len(w) > 1)
            # The original guard compared the string itself with 2 (never
            # true in Python 2) and would have skipped the whole video via
            # `continue`. Only the lookup is skipped here, so the video
            # still reaches the later fallbacks and is always serialized.
            if hometownTags:
                resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
                if resCandidates.size() > 0:
                    userHomeCoordinates += 1
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 4th METHOD: check Content-Video results
        # NOTE(review): methodUsed still says "HomeTown" when this step
        # succeeds; left unchanged because how serialize() consumes the
        # label is not visible from here.
        if resCandidates.size() == 0:
            resCandidates = self.getContentVideoResults(filename)
            if resCandidates.size() > 0:
                print >> sys.stderr, "*> Used Content-Based Approach"
                contentbased += 1
            else:
                print >> sys.stderr, "*> NO Content-Based Information"

        # 5th METHOD: (if there are NO RESULTS) define zero position
        if resCandidates.size() == 0:
            methodUsed = "DefaultCoordinates"
            nullCoordinates += 1
            resCandidates.add(-1.0, self.defaultCoordinates, "no results")
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

        # keep only the biggest coordinates group
        if resCandidates.size() > 2:
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size()
            # snapshot of the candidates before the reduction, for the log only
            candies = []
            for res in resCandidates.getList():
                coord = res.getCoord()
                candies.append(str(res.getScore()) + "," + str(coord[0]) + "|" + str(coord[1]))
            resCandidates = self.computeBiggestCoordinatesFromResults(resCandidates)
            if self.verbose:
                for can in candies:
                    print >> sys.stderr, can
                print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList()[0].getCoord()

        # write the buffer into a file
        if not officialRun:
            self.serialize(resCandidates, realCoord, methodUsed)
        self.serializeFormatted(resCandidates, filename, url)

        # progress info
        if (lines % 100) == 0:
            print >> sys.stderr, "\r*> [TestSet] %2.2f%s parsed [%d usrLoc, %d usrHome, %d content-based, %d defCoordinates, %d noTags]" % (
                (lines / totLines * 100), '%', userMostProbableLocation,
                userHomeCoordinates, contentbased, nullCoordinates, noTags),

    # final info
    t2 = time.time()
    print >> sys.stderr, "\r*> [TestSet] %d videos [%d userLocation, %d hometown, %d content-based, %d defaultCoordinates, %d with no tags] ~%.2fs" % (
        lines, userMostProbableLocation, userHomeCoordinates, contentbased,
        nullCoordinates, noTags, (t2 - t1) * 1.0)

    # compute the average of the statistics
    print >> sys.stderr, "---"
    if not officialRun:
        for k in self.limits:
            # an empty test set must not end in a division by zero
            p = float(self.stats[k]) / lines * 100 if lines else 0.0
            print >> sys.stderr, "*> [TestSet] %d videos (%.2f%s) inside a radius of %dkm" % (
                self.stats[k], p, '%', k)

    # close test file
    self.TestFile.close()
def matchWithGroupsOfTags(self, tagsLine, mtagsLine):
    '''
    Compare the test video tags and machine tags with the TrainSet and the
    GeoNames DB and return the best scoring candidates.

    tagsLine  -- space separated tags of the test video.
    mtagsLine -- space separated machine tags ("" when there are none).

    Machine tags have priority: when present, the candidate groups of tags
    (GoT) are collected from the indexes through them only; the plain tags
    (merged with the machine tags) are used when that gives nothing.
    Every candidate then receives one point per matching tag, according to
    self.matchingType ('partial' = substring of the whole GoT, 'perfect' =
    equal to one word of the GoT). Scores are computed by
    matches.computeScores, and the coordinates of the top self.topn scores
    are resolved through getMostLikelihoodCoordinates.

    Returns a ResCandidates, empty when nothing matches.
    '''
    resCandidates = ResCandidates()
    matches = MatchCandidates()
    tags = tagsLine.strip().split(" ")

    def registerGroups(words):
        # Register (with score 0) every GoT indexed under one of the words,
        # looking first in the TrainSet index and then in the GeoNames one.
        for word in words:
            for index in (self.TrainTagIndex, self.GeoNamesIndex):
                if word in index:
                    for groupOfTags in index[word]:
                        matches.update(groupOfTags, 0)

    ########################################################
    # Get the GoT where the index matches with mtag or tag #
    ########################################################
    # If there are machine tags, retrieve the GoT just from them
    if len(mtagsLine.strip()) > 2:
        mtags = mtagsLine.strip().split(" ")
        tags = tags + mtags  # merge the mtags with the tags
        registerGroups(mtags)

    # If the mtags didn't find any match, use the (merged) tags
    if matches.size() == 0:
        registerGroups(tags)

    # if there are no matches, return the empty result
    if matches.size() == 0:
        return resCandidates

    #########################################################################
    # Count the number of matches between the tags+mtags and the candidates #
    #########################################################################
    # self.matchingType: the bare name `matchingType` is not defined in
    # this method, and the attribute is what the rest of the file reads.
    matchingType = self.matchingType
    for tag in tags:
        for groupOfTags in matches.getKeys():
            if matchingType == 'partial':
                # partial match anywhere inside the entire GoT string
                if tag in groupOfTags:
                    matches.update(groupOfTags, 1)
            elif matchingType == 'perfect':
                # exact match against each word of the GoT; a word that
                # appears several times is counted every time
                for got in groupOfTags.strip().split(" "):
                    if tag == got:
                        matches.update(groupOfTags, 1)

    # compute the score of each group of tags
    matches.computeScores(self.scoreMetric, len(tags))

    # scores of the topN matches, then the keys carrying those scores
    maxScores = matches.getTopNScores(self.topn)
    topnKeys = matches.getKeysWithGivenScores(maxScores)
    if self.verbose:
        print >> sys.stderr, "matchWithGroupOfTags] we are here, maxScores: %d, topNelements: %d" % (
            maxScores[0], len(topnKeys))

    ########################################################################
    # From all the topN candidates compute the most likelihood coordinates #
    ########################################################################
    for got in topnKeys:
        if got in self.TrainTagCoord:
            # the key comes from the TrainTags
            coord = self.getMostLikelihoodCoordinates(self.TrainTagCoord[got])
        elif got in self.GeoNames:
            # the key comes from GeoNames
            coord = self.getMostLikelihoodCoordinates(self.GeoNames[got])
        else:
            continue

        # skip candidates without coordinates
        if len(coord) == 0:
            continue

        resCandidates.add(matches.getValue(got), coord, got)

    return resCandidates