def multithreadTestVideosParser(self, line, varLock=""):
    '''
    Geo-locate one test-set video line and serialize the chosen coordinates.

    The line is tab-separated. The fields read here are the tags (index 3),
    the machine tags (index 4), the real location (index 5, e.g.
    "45.516639|-122.681053|16|Portland|Oregon|etats-Unis|16") and the owner
    information (index 6, "ownerId|hometown").

    Candidate locations are searched with a cascade of methods; each method
    runs only when the previous ones returned no candidate:
      1. match the tags against the training set
      2. most probable location of the owner
      3. the owner's hometown used as tags
      4. the default coordinates

    line    -- one raw line of the test set
    varLock -- not read by this method; kept for backward compatibility
    '''
    methodUsed = "All Methods"
    fields = line.strip().split("\t")

    # get the tags
    tags = clean_line(fields[3])
    mtags = clean_line(fields[4])
    print >> sys.stderr, "---"
    print >> sys.stderr, "* input tags: %s [mtags: %s]" % (tags, mtags)

    # real location: only latitude and longitude are needed here
    geo = fields[5].strip().split("|")
    realCoord = [float(geo[0]), float(geo[1])]

    # owner info: [ownerId, hometown]; the hometown part may be missing
    ownGeo = fields[6].split("|")

    resCandidates = ResCandidates()

    # 1st METHOD: check tags in training set
    if len(tags) > 1:
        methodUsed = "TrainSet"
        resCandidates = self.matchWithGroupsOfTags(tags, mtags)
    else:
        # no usable tags: reported with the initial "All Methods" label
        self.updateLockVariables(methodUsed)
    if self.verbose:
        print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 2nd METHOD: (no results so far) most probable location for this user
    if resCandidates.size() == 0:
        methodUsed = "UserCommonLocation"
        resCandidates = self.getUserMostProbableLocation(ownGeo[0])
        if resCandidates.size() > 0:
            self.updateLockVariables(methodUsed)
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 3rd METHOD: (no results so far) use the user hometown as tags
    if resCandidates.size() == 0:
        methodUsed = "HomeTown"
        # Owner location: ['14678786@N00', 'milwaukee, United States']
        hometownParts = ownGeo[1].split(",") if len(ownGeo) > 1 else []
        hometownTags = " ".join(w for w in hometownParts if len(w) > 1)
        # The original compared the string itself with 1, which is always
        # true in Python 2; the intent is "there is at least one character".
        if hometownTags:
            resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
            if resCandidates.size() > 0:
                self.updateLockVariables(methodUsed)
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 4th METHOD: (no results so far) fall back to the default position
    if resCandidates.size() == 0:
        methodUsed = "DefaultCoordinates"
        self.updateLockVariables(methodUsed)
        resCandidates.add(-1.0, self.defaultCoordinates, "no results")
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

    # more than two candidates: keep only the biggest coordinates group
    if resCandidates.size() > 2:
        candidateDump = []
        # self.verbose is used consistently (the original mixed in a bare
        # `verbose` name), so the dump is always defined when it is printed
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size()
            for res in resCandidates.getList():
                coord = res.getCoord()
                candidateDump.append(
                    str(res.getScore()) + "," + str(coord[0]) + "|" + str(coord[1]))
        resCandidates = self.computeBiggestCoordinatesFromResults(resCandidates)
        if self.verbose:
            for entry in candidateDump:
                print >> sys.stderr, entry
            print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList()[0].getCoord()

    # write the buffer into a file
    self.serialize(resCandidates, realCoord, methodUsed)
    print >> sys.stderr, "---\n"
def multithreadTestVideosParser(self, line, varLock=""):
    '''
    Geo-locate one test-set video line and serialize the chosen coordinates.

    The line is tab-separated. The fields read here are the tags (index 3),
    the machine tags (index 4), the real location (index 5, e.g.
    "45.516639|-122.681053|16|Portland|Oregon|etats-Unis|16") and the owner
    information (index 6, "ownerId|hometown").

    Candidate locations are searched with a cascade of methods; each method
    runs only when the previous ones returned no candidate:
      1. match the tags against the training set
      2. most probable location of the owner
      3. the owner's hometown used as tags
      4. the default coordinates

    line    -- one raw line of the test set
    varLock -- not read by this method; kept for backward compatibility
    '''
    methodUsed = "All Methods"
    fields = line.strip().split("\t")

    # get the tags
    tags = clean_line(fields[3])
    mtags = clean_line(fields[4])
    print >> sys.stderr, "---"
    print >> sys.stderr, "* input tags: %s [mtags: %s]" % (tags, mtags)

    # real location: only latitude and longitude are needed here
    geo = fields[5].strip().split("|")
    realCoord = [float(geo[0]), float(geo[1])]

    # owner info: [ownerId, hometown]; the hometown part may be missing
    ownGeo = fields[6].split("|")

    resCandidates = ResCandidates()

    # 1st METHOD: check tags in training set
    if len(tags) > 1:
        methodUsed = "TrainSet"
        resCandidates = self.matchWithGroupsOfTags(tags, mtags)
    else:
        # no usable tags: reported with the initial "All Methods" label
        self.updateLockVariables(methodUsed)
    if self.verbose:
        print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 2nd METHOD: (no results so far) most probable location for this user
    if resCandidates.size() == 0:
        methodUsed = "UserCommonLocation"
        resCandidates = self.getUserMostProbableLocation(ownGeo[0])
        if resCandidates.size() > 0:
            self.updateLockVariables(methodUsed)
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 3rd METHOD: (no results so far) use the user hometown as tags
    if resCandidates.size() == 0:
        methodUsed = "HomeTown"
        # Owner location: ['14678786@N00', 'milwaukee, United States']
        hometownParts = ownGeo[1].split(",") if len(ownGeo) > 1 else []
        hometownTags = " ".join(w for w in hometownParts if len(w) > 1)
        # The original compared the string itself with 1, which is always
        # true in Python 2; the intent is "there is at least one character".
        if hometownTags:
            resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
            if resCandidates.size() > 0:
                self.updateLockVariables(methodUsed)
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

    # 4th METHOD: (no results so far) fall back to the default position
    if resCandidates.size() == 0:
        methodUsed = "DefaultCoordinates"
        self.updateLockVariables(methodUsed)
        resCandidates.add(-1.0, self.defaultCoordinates, "no results")
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

    # more than two candidates: keep only the biggest coordinates group
    if resCandidates.size() > 2:
        candidateDump = []
        # self.verbose is used consistently (the original mixed in a bare
        # `verbose` name), so the dump is always defined when it is printed
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size()
            for res in resCandidates.getList():
                coord = res.getCoord()
                candidateDump.append(
                    str(res.getScore()) + "," + str(coord[0]) + "|" + str(coord[1]))
        resCandidates = self.computeBiggestCoordinatesFromResults(resCandidates)
        if self.verbose:
            for entry in candidateDump:
                print >> sys.stderr, entry
            print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList()[0].getCoord()

    # write the buffer into a file
    self.serialize(resCandidates, realCoord, methodUsed)
    print >> sys.stderr, "---\n"
def parseTestVideos(self, withMtags=True, officialRun=False):
    '''
    Given the TestSet file, read and parse each video meta-data, select the
    tags and retrieve the most suitable places for those tags.

    Every line of self.TestFile is tab-separated:
    videoId <t> title <t> url <t> tags <t> mtags <t> location <t> ownerLocation

    For each video a cascade of methods is tried; each method runs only when
    the previous ones returned no candidate:
      1. match the tags against the training set
      2. most probable location of the owner
      3. the owner's hometown used as tags
      4. content-based results for the video file
      5. the default coordinates

    withMtags   -- forwarded to getInfoFromTestVideo
    officialRun -- when true the real coordinates are not read, only the
                   formatted output is serialized and the per-radius
                   statistics are not printed

    self.TestFile is closed when the whole file has been processed.
    '''
    print >> sys.stderr, "*> [SelectCoordMethod: %s] [ScoreMetric: %s] [MatchingType: %s]" % (
        self.selectCoordMethod, self.scoreMetric, self.matchingType)
    print >> sys.stderr, "*> [TestSet]",
    t1 = time.time()

    # counters of the fallback method that produced the result
    userMostProbableLocation = 0
    userHomeCoordinates = 0
    nullCoordinates = 0
    contentbased = 0
    noTags = 0
    # hard-coded line total, used only for the progress percentage
    totLines = 4532.0
    lines = 0

    for line in self.TestFile:
        lines += 1
        print >> sys.stderr, "---"
        methodUsed = "All Methods"

        # get filename and url
        fields = line.strip().split("\t")
        url = fields[2].strip()
        # str.replace returns a new string; the original discarded it, so
        # the '.jpg' extension was never actually removed
        filename = fields[1].split("|")[0].replace('.jpg', '')

        # get the tags, mtags, geo and ownGeo
        tags, mtags, geo, ownGeo = self.getInfoFromTestVideo(line, withMtags)
        realCoord = [float(geo[0]), float(geo[1])] if not officialRun else ""
        resCandidates = ResCandidates()

        # 1st METHOD: check tags in training set
        if len(tags) > 1:
            methodUsed = "TrainSet"
            resCandidates = self.matchWithGroupsOfTags(tags, mtags)
        else:
            noTags += 1
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 2nd METHOD: (no results so far) most probable location for this user
        if resCandidates.size() == 0:
            methodUsed = "UserCommonLocation"
            ownId = ownGeo[0]
            resCandidates = self.getUserMostProbableLocation(ownId.lower())
            if resCandidates.size() > 0:
                print >> sys.stderr, "*> Used UserCommonLocation for user %s" % ownId
                userMostProbableLocation += 1
            else:
                print >> sys.stderr, "*> NO UserCommonLocation for user %s" % ownId
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 3rd METHOD: (no results so far) use the user hometown as tags
        if resCandidates.size() == 0:
            methodUsed = "HomeTown"
            hometownParts = ownGeo[1].split(",") if len(ownGeo) > 1 else []
            hometownTags = " ".join(w for w in hometownParts if len(w) > 1)
            # The original `if (hometownTags) < 2: continue` compared a
            # string with an int (always false in Python 2). Checking the
            # length and falling through keeps every video in the output:
            # a video without hometown still reaches the later methods.
            if len(hometownTags) >= 2:
                resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
                if resCandidates.size() > 0:
                    userHomeCoordinates += 1
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 4th METHOD: check content-video results
        # NOTE(review): methodUsed is not updated here, so content-based
        # results are serialized with the "HomeTown" label - confirm intent.
        if resCandidates.size() == 0:
            resCandidates = self.getContentVideoResults(filename)
            if resCandidates.size() > 0:
                print >> sys.stderr, "*> Used Content-Based Approach"
                contentbased += 1
            else:
                print >> sys.stderr, "*> NO Content-Based Information"

        # last METHOD: (no results so far) fall back to the default position
        if resCandidates.size() == 0:
            methodUsed = "DefaultCoordinates"
            nullCoordinates += 1
            resCandidates.add(-1.0, self.defaultCoordinates, "no results")
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

        # more than two candidates: keep only the biggest coordinates group
        if resCandidates.size() > 2:
            candidateDump = []
            # self.verbose is used consistently (the original mixed in a
            # bare `verbose` name), so the dump is defined when printed
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size()
                for res in resCandidates.getList():
                    coord = res.getCoord()
                    candidateDump.append(
                        str(res.getScore()) + "," + str(coord[0]) + "|" + str(coord[1]))
            resCandidates = self.computeBiggestCoordinatesFromResults(resCandidates)
            if self.verbose:
                for entry in candidateDump:
                    print >> sys.stderr, entry
                print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList()[0].getCoord()

        # write the buffer into a file; the evaluation output needs the real
        # coordinates, so it is skipped on official runs
        if not officialRun:
            self.serialize(resCandidates, realCoord, methodUsed)
        self.serializeFormatted(resCandidates, filename, url)

        # progress info every 100 videos
        if (lines % 100) == 0:
            print >> sys.stderr, "\r*> [TestSet] %2.2f%s parsed [%d usrLoc, %d usrHome, %d content-based, %d defCoordinates, %d noTags]" % (
                (lines / totLines * 100), '%', userMostProbableLocation,
                userHomeCoordinates, contentbased, nullCoordinates, noTags),

    # final info
    t2 = time.time()
    print >> sys.stderr, "\r*> [TestSet] %d videos [%d userLocation, %d hometown, %d content-based, %d defaultCoordinates, %d with no tags] ~%.2fs" % (
        lines, userMostProbableLocation, userHomeCoordinates, contentbased,
        nullCoordinates, noTags, (t2 - t1) * 1.0)

    # compute the average of the statistics
    print >> sys.stderr, "---"
    if not officialRun:
        for k in self.limits:
            # an empty test file would otherwise divide by zero
            p = float(self.stats[k]) / lines * 100 if lines else 0.0
            print >> sys.stderr, "*> [TestSet] %d videos (%.2f%s) inside a radius of %dkm" % (
                self.stats[k], p, '%', k)

    # close test file
    self.TestFile.close()
def parseTestVideos(self, withMtags=True, officialRun=False):
    '''
    Given the TestSet file, read and parse each video meta-data, select the
    tags and retrieve the most suitable places for those tags.

    Every line of self.TestFile is tab-separated:
    videoId <t> title <t> url <t> tags <t> mtags <t> location <t> ownerLocation

    For each video a cascade of methods is tried; each method runs only when
    the previous ones returned no candidate:
      1. match the tags against the training set
      2. most probable location of the owner
      3. the owner's hometown used as tags
      4. content-based results for the video file
      5. the default coordinates

    withMtags   -- forwarded to getInfoFromTestVideo
    officialRun -- when true the real coordinates are not read, only the
                   formatted output is serialized and the per-radius
                   statistics are not printed

    self.TestFile is closed when the whole file has been processed.
    '''
    print >> sys.stderr, "*> [SelectCoordMethod: %s] [ScoreMetric: %s] [MatchingType: %s]" % (
        self.selectCoordMethod, self.scoreMetric, self.matchingType)
    print >> sys.stderr, "*> [TestSet]",
    t1 = time.time()

    # counters of the fallback method that produced the result
    userMostProbableLocation = 0
    userHomeCoordinates = 0
    nullCoordinates = 0
    contentbased = 0
    noTags = 0
    # hard-coded line total, used only for the progress percentage
    totLines = 4532.0
    lines = 0

    for line in self.TestFile:
        lines += 1
        print >> sys.stderr, "---"
        methodUsed = "All Methods"

        # get filename and url
        fields = line.strip().split("\t")
        url = fields[2].strip()
        # str.replace returns a new string; the original discarded it, so
        # the '.jpg' extension was never actually removed
        filename = fields[1].split("|")[0].replace('.jpg', '')

        # get the tags, mtags, geo and ownGeo
        tags, mtags, geo, ownGeo = self.getInfoFromTestVideo(line, withMtags)
        realCoord = [float(geo[0]), float(geo[1])] if not officialRun else ""
        resCandidates = ResCandidates()

        # 1st METHOD: check tags in training set
        if len(tags) > 1:
            methodUsed = "TrainSet"
            resCandidates = self.matchWithGroupsOfTags(tags, mtags)
        else:
            noTags += 1
        if self.verbose:
            print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 2nd METHOD: (no results so far) most probable location for this user
        if resCandidates.size() == 0:
            methodUsed = "UserCommonLocation"
            ownId = ownGeo[0]
            resCandidates = self.getUserMostProbableLocation(ownId.lower())
            if resCandidates.size() > 0:
                print >> sys.stderr, "*> Used UserCommonLocation for user %s" % ownId
                userMostProbableLocation += 1
            else:
                print >> sys.stderr, "*> NO UserCommonLocation for user %s" % ownId
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 3rd METHOD: (no results so far) use the user hometown as tags
        if resCandidates.size() == 0:
            methodUsed = "HomeTown"
            hometownParts = ownGeo[1].split(",") if len(ownGeo) > 1 else []
            hometownTags = " ".join(w for w in hometownParts if len(w) > 1)
            # The original `if (hometownTags) < 2: continue` compared a
            # string with an int (always false in Python 2). Checking the
            # length and falling through keeps every video in the output:
            # a video without hometown still reaches the later methods.
            if len(hometownTags) >= 2:
                resCandidates = self.matchWithGroupsOfTags(hometownTags, "")
                if resCandidates.size() > 0:
                    userHomeCoordinates += 1
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] got %d results with" % resCandidates.size(), methodUsed

        # 4th METHOD: check content-video results
        # NOTE(review): methodUsed is not updated here, so content-based
        # results are serialized with the "HomeTown" label - confirm intent.
        if resCandidates.size() == 0:
            resCandidates = self.getContentVideoResults(filename)
            if resCandidates.size() > 0:
                print >> sys.stderr, "*> Used Content-Based Approach"
                contentbased += 1
            else:
                print >> sys.stderr, "*> NO Content-Based Information"

        # last METHOD: (no results so far) fall back to the default position
        if resCandidates.size() == 0:
            methodUsed = "DefaultCoordinates"
            nullCoordinates += 1
            resCandidates.add(-1.0, self.defaultCoordinates, "no results")
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] defined default coordinates"

        # more than two candidates: keep only the biggest coordinates group
        if resCandidates.size() > 2:
            candidateDump = []
            # self.verbose is used consistently (the original mixed in a
            # bare `verbose` name), so the dump is defined when printed
            if self.verbose:
                print >> sys.stderr, "*> parseTestVideos] BiggestCoordinatesGroup: from %d results got 1" % resCandidates.size()
                for res in resCandidates.getList():
                    coord = res.getCoord()
                    candidateDump.append(
                        str(res.getScore()) + "," + str(coord[0]) + "|" + str(coord[1]))
            resCandidates = self.computeBiggestCoordinatesFromResults(resCandidates)
            if self.verbose:
                for entry in candidateDump:
                    print >> sys.stderr, entry
                print >> sys.stderr, "*> parseTestVideos] FINAL => ", resCandidates.getList()[0].getCoord()

        # write the buffer into a file; the evaluation output needs the real
        # coordinates, so it is skipped on official runs
        if not officialRun:
            self.serialize(resCandidates, realCoord, methodUsed)
        self.serializeFormatted(resCandidates, filename, url)

        # progress info every 100 videos
        if (lines % 100) == 0:
            print >> sys.stderr, "\r*> [TestSet] %2.2f%s parsed [%d usrLoc, %d usrHome, %d content-based, %d defCoordinates, %d noTags]" % (
                (lines / totLines * 100), '%', userMostProbableLocation,
                userHomeCoordinates, contentbased, nullCoordinates, noTags),

    # final info
    t2 = time.time()
    print >> sys.stderr, "\r*> [TestSet] %d videos [%d userLocation, %d hometown, %d content-based, %d defaultCoordinates, %d with no tags] ~%.2fs" % (
        lines, userMostProbableLocation, userHomeCoordinates, contentbased,
        nullCoordinates, noTags, (t2 - t1) * 1.0)

    # compute the average of the statistics
    print >> sys.stderr, "---"
    if not officialRun:
        for k in self.limits:
            # an empty test file would otherwise divide by zero
            p = float(self.stats[k]) / lines * 100 if lines else 0.0
            print >> sys.stderr, "*> [TestSet] %d videos (%.2f%s) inside a radius of %dkm" % (
                self.stats[k], p, '%', k)

    # close test file
    self.TestFile.close()