Example #1
    def getHistoricFeatures(self, entropy_para):
        # this method computes the features that capture the difference between
        # the current event and the background knowledge

        end_time = self.getLatestElementTime()
        begin_time = self.getEarliestElementTime()
        if self._element_type == "photos":
            pi = PhotoInterface()
        else:
            pi = TweetInterface()

        elements = []
        dt = 0
        for day in xrange(1, 8):
            # here 8 is hard coded because this version uses 7 days' data as the training window
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_elements = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
            inds = range(0, day_elements.count())
            # randomly sample at most 40 elements per day
            if len(inds) > 40:
                random.shuffle(inds)
                inds = inds[0:40]
            for i in inds:
                elements.append(day_elements[i])

        random.shuffle(elements)
        elements = elements[0 : min(len(self._event[self._element_type]), len(elements))]

        if len(elements) == 0:
            # TODO: refine
            return [1, 10, 10]

        # fake a historic event
        historic_event = BaseEvent(self._element_type)
        historic_event.setElements(elements)
        historic_event.setRegion(self._event["region"])
        historic_event.setActualValue(historic_event._getActualValueByCounting())
        historic_event = BaseFeature(historic_event)

        # compute the divergence between the two (already smoothed) distributions
        pro1 = self._divideAndCount(entropy_para)
        pro2 = historic_event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words

        topic_divergence = self.computeWordKLDivergenceWith(historic_event)

        return [
            historic_event.getElementDisFeatures()[1],
            topic_divergence,
            #               historic_event.getEntropy(entropy_para),
            entropy_divergence,
        ]
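These examples all lean on KLDivergence.averageKLDivergence, whose implementation is not shown here. Below is a minimal sketch of what a symmetrized ("average") KL divergence over two count vectors might look like; the function name, the normalization, the epsilon smoothing, and the averaging of the two directed divergences are assumptions, and the original class may differ.

import math

def average_kl_divergence(counts_p, counts_q, eps=1e-9):
    # Hypothetical sketch: turn the two count vectors into smoothed probability
    # distributions, then average the two directed KL divergences so that the
    # result is symmetric in its arguments.
    def normalize(counts):
        total = float(sum(counts)) + eps * len(counts)
        return [(c + eps) / total for c in counts]

    def kl(p, q):
        return sum(p_i * math.log(p_i / q_i) for p_i, q_i in zip(p, q))

    p = normalize(counts_p)
    q = normalize(counts_q)
    return 0.5 * (kl(p, q) + kl(q, p))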
Example #2
    def getHistoricFeatures(self, entropy_para):
        # this method computes the features that capture the difference between
        # the current event and the background knowledge

        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()

        photos = []
        dt = 0
        for day in xrange(1, 15):
            # here 15 is hard coded because we use 14 days' data as the training window
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
            for photo in day_photos:
                # rangeQuery returns photos sorted from most recent to earliest,
                # so all photos appended to the list "photos" are ordered by
                # creation time from most recent to earliest
                photos.append(photo)

        random.shuffle(photos)
        photos = photos[0:min(len(self._event['photos']), len(photos))]

        if len(photos) == 0:
            # TODO: refine
            return [1, 10, 10]

        # fake a historic event
        historic_event = Event()
        historic_event.setPhotos(photos)
        historic_event.setRegion(self._event['region'])
        historic_event.setActualValue(historic_event._getActualValueByCounting())
        historic_event = BaseFeature(historic_event)

        # compute the divergence between the two (already smoothed) distributions
        pro1 = self._divideAndCount(entropy_para)
        pro2 = historic_event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words

        topic_divergence = self.computeWordKLDivergenceWith(historic_event)

        return [historic_event.getPhotoDisFeatures()[3], topic_divergence,
                #               historic_event.getEntropy(entropy_para),
                entropy_divergence]
Example #3
	def getHistoricFeatures(self, entropy_para):
		# this method computes the features that capture the difference between
		# the current event and the background knowledge
		
		end_time = self.getLatestPhotoTime()
		begin_time = self.getEarliestPhotoTime()
		
		pi = PhotoInterface()
		pi.setDB('citybeat')
		pi.setCollection('photos')
		
		photos = []
		dt = 0
		for day in xrange(1,15):
			# here 15 is hard coded because we use 14 days' data as the training window
			et = end_time - day * 24 * 3600 + dt / 2
			bt = begin_time - day * 24 * 3600 - dt / 2
			day_photos = pi.rangeQuery(self._event['region'], [str(bt), str(et)])
			for photo in day_photos:
				# rangeQuery returns photos sorted from most recent to earliest,
				# so all photos appended to the list "photos" are ordered by
				# creation time from most recent to earliest
				photos.append(photo)
				
		random.shuffle(photos)
		photos = photos[0:min(len(self._event['photos']), len(photos))]
		
		# fake a historic event
		historic_event = Event()
		historic_event.setPhotos(photos)
		historic_event.setRegion(self._event['region'])
		historic_event.setActualValue(historic_event._getActualValueByCounting())
		historic_event = EventFeature(historic_event)
		
		# compute the divergence between the two (already smoothed) distributions
		pro1 = self._divideAndCount(entropy_para)
		pro2 = historic_event._divideAndCount(entropy_para)
		entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)
		
		# compute the difference between top words
		
		topic_divergence = self.computeWordKLDivergenceWith(historic_event)
		
		return [historic_event.getPhotoDisFeatures()[3], topic_divergence,
#		        historic_event.getEntropy(entropy_para),
		        entropy_divergence]
Example #4
    def computeWordKLDivergenceWith(self, event):
        if type(event) is types.DictType:
            fake_event = BaseFeature(event)
        else:
            fake_event = event
        event_topword_list = self._getTopWords(-1, True)
        event_topword_list2 = fake_event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + event_topword_list2:
            if not ind.has_key(word):
                ind[word] = n_ind
                n_ind += 1
        freq1 = [0] * n_ind
        freq2 = [0] * n_ind
        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in event_topword_list2:
            freq2[ind[word]] = freq
        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)
        return topic_divergence
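To make the index-building loop above concrete, here is a small trace with made-up word lists (the words and counts are illustrative only, not taken from the original data):

# Hypothetical top-word lists in the shape returned by _getTopWords:
event_topword_list  = [("concert", 5), ("park", 3)]
event_topword_list2 = [("park", 2), ("rain", 4)]

# Running the loop above over both lists yields
#   ind = {"concert": 0, "park": 1, "rain": 2}
# and the aligned frequency vectors become
#   freq1 = [5, 3, 0]   # from event_topword_list (self)
#   freq2 = [0, 2, 4]   # from event_topword_list2 (the event passed in)
# which are then passed to KLDivergence.averageKLDivergence.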
Example #5
	def computeWordKLDivergenceWith(self, event):
		if type(event) is types.DictType:
			fake_event = EventFeature(event)
		else:
			fake_event = event
		event_topword_list = self._getTopWords(-1, True)
		event_topword_list2 = fake_event._getTopWords(-1, True)
		
		n_ind = 0
		ind = {}
		for word, freq in event_topword_list + event_topword_list2:
			if not ind.has_key(word):
				ind[word] = n_ind
				n_ind += 1
		freq1 = [0] * n_ind
		freq2 = [0] * n_ind
		for word, freq in event_topword_list:
			freq1[ind[word]] = freq
		for word, freq in event_topword_list2:
			freq2[ind[word]] = freq
		topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)
		return topic_divergence
Example #6
    def getHistoricFeatures(self, entropy_para):
        # this method computes the features that capture the difference between
        # the current event and the background knowledge

        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB("citybeat")
        pi.setCollection("photos")

        photos = []
        dt = 3600
        for day in xrange(1, 15):
            # here 15 is hard coded because we use 14 days' data as the training window
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_photos = pi.rangeQuery(self._event["region"], [str(bt), str(et)])
            for photo in day_photos:
                # rangeQuery returns photos sorted from most recent to earliest,
                # so all photos appended to the list "photos" are ordered by
                # creation time from most recent to earliest
                photos.append(photo)

        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event["region"])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # compute the divergence between the two (already smoothed) distributions
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + historic_topword_list:
            if not ind.has_key(word):
                ind[word] = n_ind
                n_ind += 1

        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            # 		        event.getEntropy(entropy_para),
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto(),
        ]
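The bt/et arithmetic in the loop above shifts the event's own time span back by a whole number of days and pads it by dt seconds in total. Below is a standalone restatement of that window construction using the same formula as the code; the function name and the example times are illustrative only.

def historic_window(begin_time, end_time, day, dt=3600):
    # Shift the event's [begin_time, end_time] span back by `day` whole days
    # and widen it by dt/2 seconds on each side, mirroring the et/bt
    # computation in getHistoricFeatures.
    et = end_time - day * 24 * 3600 + dt / 2
    bt = begin_time - day * 24 * 3600 - dt / 2
    return bt, et

# For an event spanning 18:00-19:00 today, the day=1 window covers
# 17:30-19:30 yesterday when dt == 3600.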
Example #7
    def getHistoricFeatures(self, entropy_para):
        # this method computes the features that capture the difference between
        # the current event and the background knowledge

        end_time = self.getLatestPhotoTime()
        begin_time = self.getEarliestPhotoTime()

        pi = PhotoInterface()
        pi.setDB('citybeat')
        pi.setCollection('photos')

        photos = []
        dt = 3600
        for day in xrange(1, 15):
            # here 15 is hard coded because we use 14 days' data as the training window
            et = end_time - day * 24 * 3600 + dt / 2
            bt = begin_time - day * 24 * 3600 - dt / 2
            day_photos = pi.rangeQuery(self._event['region'],
                                       [str(bt), str(et)])
            for photo in day_photos:
                # rangeQuery returns photos sorted from most recent to earliest,
                # so all photos appended to the list "photos" are ordered by
                # creation time from most recent to earliest
                photos.append(photo)

        event = Event()
        event.setPhotos(photos)
        event.setRegion(self._event['region'])
        event.setActualValue(event.getActualValueByCounting())
        event = EventFeature(event)

        # compute the divergence between the two (already smoothed) distributions
        pro1 = self._divideAndCount(entropy_para)
        pro2 = event._divideAndCount(entropy_para)
        entropy_divergence = KLDivergence.averageKLDivergence(pro1, pro2)

        # compute the difference between top words
        event_topword_list = self._getTopWords(-1, True)
        historic_topword_list = event._getTopWords(-1, True)

        n_ind = 0
        ind = {}
        for word, freq in event_topword_list + historic_topword_list:
            if not ind.has_key(word):
                ind[word] = n_ind
                n_ind += 1

        freq1 = [0] * n_ind
        freq2 = [0] * n_ind

        for word, freq in event_topword_list:
            freq1[ind[word]] = freq
        for word, freq in historic_topword_list:
            freq2[ind[word]] = freq

        topic_divergence = KLDivergence.averageKLDivergence(freq1, freq2)

        return [
            event.getAvgPhotoDis(),
            topic_divergence,
            #		        event.getEntropy(entropy_para),
            entropy_divergence,
            event.getAvgCaptionLen(),
            event.getRatioOfPeopleToPhoto()
        ]