def removeDuplicatePhotos(self):
    """Drop photos that repeat an earlier photo's user name and caption.

    A photo counts as a duplicate when an earlier kept photo has the same
    user name and the same non-empty caption. Photos without a caption are
    always kept. ``self._event['photos']`` is replaced only when at least
    one duplicate was found.

    Returns:
        The number of duplicate photos that were dropped.
    """
    # NOTE: matching on captions is only a rough heuristic; the original
    # author marked this method as a temporary solution.
    kept_photos = []
    seen_keys = set()  # (user name, caption) pairs of the photos kept so far
    num_duplicate = 0
    for photo in self._event['photos']:
        p = Photo(photo)
        caption = p.getCaption()
        if caption:
            key = (p.getUserName(), caption)
            if key in seen_keys:
                num_duplicate += 1
                continue
            seen_keys.add(key)
        kept_photos.append(photo)
    if num_duplicate > 0:
        self._event['photos'] = kept_photos
    return num_duplicate
def removeDuplicatePhotos(self):
    """Remove photos whose (user name, caption) pair was already seen.

    Only photos with a non-empty caption can be duplicates; a photo with no
    caption is never dropped. The first photo of each pair is the one that
    is kept. The event's photo list is rewritten only if something was
    actually removed.

    Returns:
        How many photos were identified as duplicates and removed.
    """
    # NOTE: the original author flagged this caption-based check as a
    # temporary measure, not a reliable duplicate detector.
    unique_photos = []
    seen = set()  # (user name, caption) of every captioned photo kept so far
    num_duplicate = 0
    for photo in self._event['photos']:
        wrapped = Photo(photo)
        caption = wrapped.getCaption()
        key = (wrapped.getUserName(), caption) if caption else None
        if key is not None and key in seen:
            num_duplicate += 1
            continue
        if key is not None:
            seen.add(key)
        unique_photos.append(photo)
    if num_duplicate > 0:
        self._event['photos'] = unique_photos
    return num_duplicate
def getWordList(self, event):
    """Collect the captions of ``event``'s photos and return their top words.

    Every photo in ``event['photos']`` is wrapped in ``Photo`` and its
    caption is fed to a ``CaptionParser(True)``. The result is whatever
    ``getTopWords(-1, False)`` returns; per the original comment this is a
    list of (word, freq) pairs.
    """
    parser = CaptionParser(True)
    for raw_photo in event['photos']:
        parser.insertCaption(Photo(raw_photo).getCaption())
    return parser.getTopWords(-1, False)
def _getTopWords(self, k, stopword_removal=False):
    """Return the parser's top ``k`` words over this event's captions.

    Captions that are ``None`` are skipped; every other caption is inserted
    into a ``CaptionParser`` built with the given ``stopword_removal`` flag.
    """
    parser = CaptionParser(stopword_removal=stopword_removal)
    # Lazy generator: each photo is wrapped right before its caption is used.
    captions = (Photo(raw).getCaption() for raw in self._event['photos'])
    for text in captions:
        if text is None:
            continue
        parser.insertCaption(text)
    return parser.getTopWords(k)
def _getTopWords(self, k, stopword_removal=False):
    """Feed this event's captions to a CaptionParser and return its top words.

    ``k`` is passed straight to ``CaptionParser.getTopWords``;
    ``stopword_removal`` is forwarded to the parser's constructor. Photos
    whose caption is ``None`` contribute nothing.
    """
    parser = CaptionParser(stopword_removal=stopword_removal)
    for raw_photo in self._event["photos"]:
        text = Photo(raw_photo).getCaption()
        if text is not None:
            parser.insertCaption(text)
    return parser.getTopWords(k)
def getCaptionPercentage(self):
    """Return the share of this event's photos that have a non-empty caption.

    Despite the name, the value is a fraction between 0.0 and 1.0, not a
    number out of 100.

    Returns:
        Captioned photos divided by all photos, as a float; 0.0 when the
        event has no photos at all.
    """
    photos = self._event['photos']
    if len(photos) == 0:
        # Nothing to measure; avoids dividing by zero below.
        return 0.0
    captioned = sum(
        1 for photo in photos if len(Photo(photo).getCaption()) > 0
    )
    # Multiply by 1.0 so the division is a float division on Python 2 too.
    return captioned * 1.0 / len(photos)
def getCaptionPercentage(self):
    """Return the fraction of photos in this event with a non-empty caption.

    The result is a ratio in the range 0.0-1.0 (the method name says
    "percentage", but no scaling by 100 is applied).

    Returns:
        ``captioned / total`` as a float, or 0.0 for an event with no
        photos.
    """
    photos = self._event["photos"]
    total = len(photos)
    if total == 0:
        # An empty event would otherwise raise ZeroDivisionError.
        return 0.0
    cap_number = 0
    for photo in photos:
        if len(Photo(photo).getCaption()) > 0:
            cap_number += 1
    # 1.0 forces float division under Python 2.
    return cap_number * 1.0 / total
def _getTopWords(self, k, stopword_removal=False):
    """Return the top ``k`` words of this event's captions via TextParser.

    Per the original comment, top words are ranked by counting their
    frequency. Captions equal to ``None`` are ignored; ``stopword_removal``
    is handed to the ``TextParser`` constructor unchanged.
    """
    parser = TextParser(stopword_removal=stopword_removal)
    for raw_photo in self._event['photos']:
        text = Photo(raw_photo).getCaption()
        if text is None:
            continue
        parser.insertCaption(text)
    return parser.getTopWords(k)
def computeWordKLDivergenceWithByEddie(self, event):
    """Compute the KL divergence between this event's captions and another's.

    The captions of each event are joined into one text (every caption
    preceded by a single space), tokenized with ``tokenize`` and passed to
    ``kldiv`` (Eddie's implementation, per the original comment).

    Args:
        event: Either a dict with a ``'photos'`` list, or an object whose
            ``toJSON()`` method returns such a dict.

    Returns:
        Whatever ``kldiv`` returns for (this event's tokens, the other
        event's tokens), in that argument order.
    """
    text1 = ''.join(
        ' ' + Photo(photo).getCaption() for photo in self._event['photos']
    )
    # isinstance also accepts dict subclasses and does not depend on the
    # Python 2 only ``types.DictType`` alias.
    if not isinstance(event, dict):
        event = event.toJSON()
    text2 = ''.join(
        ' ' + Photo(photo).getCaption() for photo in event['photos']
    )
    return kldiv(tokenize(text1), tokenize(text2))
def computeWordKLDivergenceWithByEddie(self, event):
    """Compute the KL divergence between this event's captions and another's.

    The captions of each event are joined into one text (every caption
    preceded by a single space), tokenized with ``tokenize`` and passed to
    ``kldiv`` (Eddie's implementation, per the original comment).

    Args:
        event: Either a dict with a ``'photos'`` list, or an object whose
            ``toDict()`` method returns such a dict.

    Returns:
        Whatever ``kldiv`` returns for (this event's tokens, the other
        event's tokens), in that argument order.
    """
    text1 = ''.join(
        ' ' + Photo(photo).getCaption() for photo in self._event['photos']
    )
    # isinstance also accepts dict subclasses and does not depend on the
    # Python 2 only ``types.DictType`` alias.
    if not isinstance(event, dict):
        event = event.toDict()
    text2 = ''.join(
        ' ' + Photo(photo).getCaption() for photo in event['photos']
    )
    return kldiv(tokenize(text1), tokenize(text2))
def PhotoDistanceByCaption(photo1, photo2):
    """Return ``kldiv`` of the two photos' caption word frequencies.

    Each caption is parsed by its own ``TextParser(True)`` and the resulting
    (word, freq) pairs are turned into dicts for ``kldiv``.

    Returns:
        The ``kldiv`` result, or ``None`` when either caption yields no
        words and the photos therefore cannot be compared.
    """
    first = Photo(photo1)
    second = Photo(photo2)
    first_caption = first.getCaption()
    second_caption = second.getCaption()

    first_parser = TextParser(True)
    first_parser.insertCaption(first_caption)
    second_parser = TextParser(True)
    second_parser.insertCaption(second_caption)

    first_words = first_parser.getTopWords(-1)
    second_words = second_parser.getTopWords(-1)
    if not (len(first_words) > 0 and len(second_words) > 0):
        # unable to compare
        return None

    first_freqs = {word: freq for word, freq in first_words}
    second_freqs = {word: freq for word, freq in second_words}
    return kldiv(first_freqs, second_freqs)
def PhotoDistanceByCaption(photo1, photo2):
    """Measure the caption distance between two photos with ``kldiv``.

    Both captions go through separate ``CaptionParser(True)`` instances;
    their (word, freq) lists become the two dict arguments of ``kldiv``.

    Returns:
        ``kldiv(word_dict1, word_dict2)``, or ``None`` if at least one of
        the captions produced an empty word list.
    """
    wrapped1 = Photo(photo1)
    wrapped2 = Photo(photo2)
    caption1 = wrapped1.getCaption()
    caption2 = wrapped2.getCaption()

    parser1 = CaptionParser(True)
    parser1.insertCaption(caption1)
    parser2 = CaptionParser(True)
    parser2.insertCaption(caption2)

    pairs1 = parser1.getTopWords(-1)
    pairs2 = parser2.getTopWords(-1)
    if len(pairs1) == 0:
        return None  # unable to compare
    if len(pairs2) == 0:
        return None  # unable to compare

    freqs1 = {}
    for word, freq in pairs1:
        freqs1[word] = freq
    freqs2 = {}
    for word, freq in pairs2:
        freqs2[word] = freq
    return kldiv(freqs1, freqs2)
def getAvgCaptionLen(self):
    """Return the mean caption length over photos that have a caption.

    Photos with an empty caption are left out of both the sum and the
    count.

    Returns:
        The average length as a float, or -1 when no photo in the event has
        a non-empty caption.
    """
    lengths = [len(Photo(raw).getCaption()) for raw in self._event['photos']]
    non_empty = [length for length in lengths if length > 0]
    if not non_empty:
        return -1
    return 1.0 * sum(non_empty) / len(non_empty)
def getAvgCaptionLen(self):
    """Average the lengths of all non-empty captions in this event.

    Returns:
        ``total length / number of captioned photos`` as a float. If no
        photo has a non-empty caption the sentinel -1 is returned instead.
    """
    total_len = 0
    captioned = 0
    for raw_photo in self._event["photos"]:
        length = len(Photo(raw_photo).getCaption())
        if length <= 0:
            continue
        total_len += length
        captioned += 1
    return 1.0 * total_len / captioned if captioned != 0 else -1
def countHashtagsFromPhotosContainingTopKeywords(self, k=3):
    """Report hashtag usage for the photos tied to each top keyword.

    The keywords and their photos come from
    ``self.getTopKeywordsAndPhotos(k, 10000)``; per the original comment the
    keywords are ranked by frequency. Element ``[2]`` of every returned
    entry is used as that keyword's photo list.

    Args:
        k: Number of top keywords to look at.

    Returns:
        ``[avg_hashtags, photo_counts]``, two lists of length ``k``.
        ``avg_hashtags[i]`` is the average number of ``'#'`` characters per
        caption among keyword ``i``'s photos (0.0 if it has no photos) and
        ``photo_counts[i]`` is how many photos keyword ``i`` has. Slots
        without a keyword stay 0.
    """
    word_photo_list = self.getTopKeywordsAndPhotos(k, 10000)
    avg_hashtags = [0] * k
    photo_counts = [0] * k
    for i, entry in enumerate(word_photo_list):
        photos = entry[2]
        num_photos = len(photos)
        total_hashtags = sum(
            Photo(photo).getCaption().count('#') for photo in photos
        )
        # A keyword without photos would otherwise divide by zero.
        if num_photos > 0:
            avg_hashtags[i] = total_hashtags * 1.0 / num_photos
        else:
            avg_hashtags[i] = 0.0
        photo_counts[i] = num_photos
    return [avg_hashtags, photo_counts]
def countHashtagsFromPhotosContainingTopKeywords(self, k=3):
    """Count hashtags in the captions of photos linked to the top keywords.

    Calls ``self.getTopKeywordsAndPhotos(k, 10000)`` and treats element
    ``[2]`` of each returned entry as the photos of one keyword (keywords
    are ranked by frequency according to the original comment).

    Args:
        k: Number of top keywords to look at.

    Returns:
        A two-element list ``[cnt, cnt2]`` where both members have length
        ``k``: ``cnt[i]`` is the mean count of ``'#'`` characters per
        caption for keyword ``i`` (0.0 when that keyword has no photos) and
        ``cnt2[i]`` is the number of photos for keyword ``i``. Positions
        with no keyword keep the initial 0.
    """
    word_photo_list = self.getTopKeywordsAndPhotos(k, 10000)
    cnt = [0] * k
    cnt2 = [0] * k
    for i, entry in enumerate(word_photo_list):
        photos = entry[2]
        hashtags = 0
        seen = 0
        for photo in photos:
            hashtags += Photo(photo).getCaption().count('#')
            seen += 1
        # Average hashtags per photo; guard against an empty photo list.
        cnt[i] = hashtags * 1.0 / seen if seen else 0.0
        # Number of photos associated with this keyword.
        cnt2[i] = len(photos)
    return [cnt, cnt2]
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface
import random

if __name__ == '__main__':
    # Copy a random sample of non-empty photo captions from the
    # 'citybeat'/'photos' collection into 'test_caption'/'captions'.
    photo_source = PhotoInterface()
    photo_source.setDB('citybeat')
    photo_source.setCollection('photos')

    caption_sink = MongoDBInterface()
    caption_sink.setDB('test_caption')
    caption_sink.setCollection('captions')

    for raw_photo in photo_source.getAllDocuments():
        # randint(0, 10) is inclusive on both ends, so only about one photo
        # in eleven gets past this check.
        if random.randint(0, 10) != 0:
            continue
        caption = Photo(raw_photo).getCaption()
        if len(caption) > 0:
            caption_sink.saveDocument({'caption': caption})
from photo_interface import PhotoInterface
from caption_parser import CaptionParser
from photo import Photo
from mongodb_interface import MongoDBInterface
import random

if __name__ == '__main__':
    # Source: every document of the 'photos' collection in 'citybeat'.
    pi = PhotoInterface()
    pi.setDB('citybeat')
    pi.setCollection('photos')

    # Destination: the 'captions' collection in 'test_caption'.
    mi = MongoDBInterface()
    mi.setDB('test_caption')
    mi.setCollection('captions')

    all_photos = pi.getAllDocuments()
    for document in all_photos:
        # Keep a photo only when the draw from 0..10 (inclusive) is 0,
        # i.e. roughly one photo out of eleven.
        draw = random.randint(0, 10)
        if draw <= 0:
            text = Photo(document).getCaption()
            # Photos without caption text are not stored.
            if len(text) > 0:
                record = {'caption': text}
                mi.saveDocument(record)