Пример #1
0
    def goThroughCandidateDB(self):
        """Go through candidate event db and classify whatever is left"""
        ei = EventInterface(self.candidate_db, self.candidate_collection)
        ei_classified = EventInterface(self.classified_event_db, self.classified_event_collection)
        cnt = 0
        # consider past 2 hours for merge
        low_bound = str(int(getCurrentStampUTC()) - 60 * 60 * 2)
        condition = {'created_time':{ '$gte':  low_bound}}
        for e in ei.getAllDocuments(condition=condition):
            logging.warning("Classifying %d-th candidate event..." % cnt)
            e = Event(e)
            cnt += 1
            region = Region(e.getRegion())
            corpus = self.all_corpus[region.getKey()]
            ef = BaseFeatureProduction(e, corpus)
            prob = self.clf.classify(ef.extractFeatures())

            if ei_classified.getEventByID(e.getID()) is not None:
                if prob > 0.5:
                    print 'already in front end collection, merge it'
                    ei_classified.addEvent(e)
                else:
                    print 'after merge it becomes none event, delete it'
                    ei_classified.deleteEventByID(e.getID())
            else:
                if prob > 0.5:
                    print 'new events find in collection but not in front end , add it'
                    ei_classified.addEvent(e)
def findLast24HourEvents():
    """Write the URLs of the last day's front-end events to ``csv_file``.

    The 24-hour window ends one hour before the current UTC stamp.  Each
    matching event produces one CSV row holding a single URL built from the
    event's ``_id``.  ``csv_file`` is not defined in this function; it is
    presumably a module-level path -- confirm.

    Returns:
        The number of events written.
    """
    interface = EventInterface()
    interface.setCollection(InstagramConfig.front_end_events)

    # for merge reason, delay one hour
    end_time = int(getCurrentStampUTC()) - 60 * 60
    begin_time = end_time - 24 * 3600

    window = {'created_time': {'$gte': str(begin_time), '$lte': str(end_time)}}
    cursor = interface.getAllFields(fields=['_id'], condition=window)

    url_prefix = 'http://ec2-23-22-67-45.compute-1.amazonaws.com/cb/event/'
    with open(csv_file, 'wb') as csvfile:
        rows = [[url_prefix + str(event['_id'])] for event in cursor]
        csv.writer(csvfile, delimiter=',').writerows(rows)

    return len(rows)
Пример #3
0
    def fireAlarm(self):
        """Store a candidate event when the current photo count is anomalous.

        The mean and standard deviation for the current UTC hour are divided
        by 4 and used to z-score ``self.current_value``.  An event is added to
        ``self.candidate_collection`` only when the z-score is above 3 and the
        current value is at least 8.
        """
        self._getFiftenMiniutesPhotos()  # get current_value
        hour = datetime.utcfromtimestamp(float(self.cur_time)).hour

        # presumably rescales hourly figures to the 15-minute window -- confirm
        mu = self.means[hour] / 4.0
        std = self.stds[hour] / 4.0
        # NOTE(review): a zero std makes this division fail -- confirm that
        # self.stds can never hold 0 for an hour.
        zscore = (self.current_value - mu) * 1.0 / std

        if not (zscore > 3 and self.current_value >= 8):
            return

        event = Event()
        event.setPredictedValues(mu, std)
        event.setZscore(zscore)
        event.setRegion(self.region)
        event.setCreatedTime(self.cur_time)
        event.setActualValue(self.current_value)
        for photo in self.photos:
            event.addPhoto(photo)

        interface = EventInterface()
        interface.setCollection(self.candidate_collection)
        interface.addEvent(event)
Пример #4
0
    def fireAlarm(self):
        prediction = self.getNearestPrediction()

        self._getFiftenMiniutesData()
        if prediction is None:
            print 'No prediction'
            return
        else:
            print 'Data!'
        mu = float(prediction['mu']) / 4.0
        std = float(prediction['std']) / 4.0
        time_stamp = prediction['time']
        zscore = (self.current_value - mu) * 1.0 / std
        print 'cur value = ', self.current_value, 'zscore = ', zscore
        if zscore > 3.0 and self.current_value > 5:   #comment this
            print 'in alarm!, cur value = ', self.current_value
            if self.data_source == 'twitter':
                e = TweetEvent()
                for dt in self.data:
                    e.addTweet(dt)
            elif self.data_source == 'instagram':
                e = PhotoEvent()
                for dt in self.data:
                    e.addPhoto(dt)

            e.setPredictedValues(mu, std)
            e.setZscore(zscore)
            e.setRegion(self.region)
            e.setCreatedTime(self.cur_time)
            e.setActualValue(self.current_value)

            ei = EventInterface()
            ei.setCollection(self.candidate_collection)
            print e.getEarliestPhotoTime(), e.getLatestPhotoTime()
            print ei.addEvent(e)
Пример #5
0
    def fireAlarm(self):
        prediction = self.getNearestPrediction()

        self._getFiftenMiniutesPhotos()
        if prediction is None:
            print "None data for this region: details as follow"
            self.region.display()
            print "time:", self.cur_time
            return
        mu = float(prediction["mu"]) / 4.0
        std = float(prediction["std"]) / 4.0
        time_stamp = prediction["time"]

        zscore = (self.current_value - mu) * 1.0 / std

        if zscore > 3:
            e = Event()
            e.setPredictedValues(mu, std)
            e.setZscore(zscore)
            e.setRegion(self.region)
            e.setCreatedTime(self.cur_time)
            e.setActualValue(self.current_value)

            for p in self.photos:
                e.addPhoto(p)
            # print 'current value ',4.0*self.current_value, ' predict = ',mu*4.0,' std = ',std*4.0

            ei = EventInterface()
            ei.setCollection(self.candidate_collection)
            print e.getEarliestPhotoTime(), e.getLatestPhotoTime()
            # print e.toJSON()['region']
            # ei.addEvent(e)
            ei.addEventWithoutMerge(e)
Пример #6
0
    def __init__(self):
        """Connect to the merged candidate events and load CrowdFlower codes.

        Uses the 'candidate_event_25by25_merged' collection of the 'AmazonMT'
        database and creates a ``Representor``.
        """
        interface = EventInterface()
        interface.setDB('AmazonMT')
        interface.setCollection('candidate_event_25by25_merged')
        self.ei = interface
        self.representor = Representor()

        self._loadCrowdFlowerCode()
Пример #7
0
 def __init__(self):
     """Connect to the configured front-end events and the stats store.

     Database and collection names come from ``InstagramConfig``.
     """
     interface = EventInterface()
     interface.setDB(InstagramConfig.event_db)
     interface.setCollection(InstagramConfig.front_end_events)
     self.ei = interface
     self.stats_interface = StatsInterface()
Пример #8
0
 def __init__(self):
     """Connect to the merged candidate events and load CrowdFlower codes.

     Uses the 'candidate_event_25by25_merged' collection of the 'AmazonMT'
     database and creates a ``Representor``.
     """
     interface = EventInterface()
     interface.setDB('AmazonMT')
     interface.setCollection('candidate_event_25by25_merged')
     self.ei = interface
     self.representor = Representor()

     self._loadCrowdFlowerCode()
Пример #9
0
    def __init__(self):
        """Connect to the online candidate events and set up empty caches.

        Uses the 'online_candidate' collection of the 'citybeat' database,
        loads the CrowdFlower codes, then creates the event and photo caches.
        """
        interface = EventInterface()
        interface.setDB('citybeat')
        interface.setCollection('online_candidate')
        self.ei = interface

        self._loadCrowdFlowerCode()

        self.cache_events, self.cache_photos = {}, {}
Пример #10
0
    def fireAlarm(self):
        prediction = self.getNearestPrediction()

        self._getFiftenMiniutesPhotos()
        if prediction is None:
            print 'None data for this region: details as follow'
            self.region.display()
            print 'time:', self.cur_time
            return
        mu = float(prediction['mu']) / 4.0
        std = float(prediction['std']) / 4.0
        time_stamp = prediction['time']

        zscore = (self.current_value - mu) * 1.0 / std

        if zscore > 3:
            e = Event()
            e.setPredictedValues(mu, std)
            e.setZscore(zscore)
            e.setRegion(self.region)
            e.setCreatedTime(self.cur_time)
            e.setActualValue(self.current_value)

            for p in self.photos:
                e.addPhoto(p)
            #print 'current value ',4.0*self.current_value, ' predict = ',mu*4.0,' std = ',std*4.0

            ei = EventInterface()
            ei.setCollection(self.candidate_collection)
            print e.getEarliestPhotoTime(), e.getLatestPhotoTime()
            #print e.toJSON()['region']
            #ei.addEvent(e)
            ei.addEventWithoutMerge(e)
Пример #11
0
    def fireAlarm(self):
        """Store a candidate event when the current photo count is anomalous.

        The mean and standard deviation for the current UTC hour are divided
        by 4 and used to z-score ``self.current_value``.  An event is added to
        ``self.candidate_collection`` only when the z-score is above 3 and the
        current value is at least 8.
        """
        self._getFiftenMiniutesPhotos()  # get current_value
        hour = datetime.utcfromtimestamp(float(self.cur_time)).hour

        # presumably rescales hourly figures to the 15-minute window -- confirm
        mu = self.means[hour] / 4.0
        std = self.stds[hour] / 4.0
        # NOTE(review): a zero std makes this division fail -- confirm that
        # self.stds can never hold 0 for an hour.
        zscore = (self.current_value - mu) * 1.0 / std

        if not (zscore > 3 and self.current_value >= 8):
            return

        event = Event()
        event.setPredictedValues(mu, std)
        event.setZscore(zscore)
        event.setRegion(self.region)
        event.setCreatedTime(self.cur_time)
        event.setActualValue(self.current_value)
        for photo in self.photos:
            event.addPhoto(photo)

        interface = EventInterface()
        interface.setCollection(self.candidate_collection)
        interface.addEvent(event)
Пример #12
0
    def __init__(self):
        self.ei = EventInterface()
        #self.ei.setDB('AmazonMT')
        #self.ei.setCollection('candidate_event_25by25_merged')
        
        #self.ei.setDB('citybeat')
        #self.ei.setCollection('baseline_candidate_events')

        self.ei.setDB('citybeat_production')

        #self.ei.setCollection('next_week_candidate_event_25by25_merged')
        #self.representor = Representor(db='citybeat', collection='next_week_candidate_event_25by25_merged')
        
        #print 'Building representor'
        #self.representor = Representor(db='citybeat_production', collection='instagram_front_end_events')
        
        print 'Building done'
        self.ei.setCollection('instagram_front_end_events')
        #self.ei.setCollection('online_candidate_instagram')
        
        self._loadCrowdFlowerCode()
Пример #13
0
    def __init__(self):
        """Connect to the online candidate events and set up empty caches.

        Uses the 'online_candidate' collection of the 'citybeat' database,
        loads the CrowdFlower codes, then creates the event and photo caches.
        """
        interface = EventInterface()
        interface.setDB('citybeat')
        interface.setCollection('online_candidate')
        self.ei = interface

        self._loadCrowdFlowerCode()

        self.cache_events, self.cache_photos = {}, {}
Пример #14
0
class Root:
    """HTTP handler that serves merged candidate events as JSON.

    Methods flagged with ``exposed = True`` are the public endpoints
    (presumably CherryPy's exposure convention -- confirm).  Events are read
    from the 'candidate_event_25by25_merged' collection of the 'AmazonMT'
    database.
    """

    def __init__(self):
        """Connect to the event collection and load the CrowdFlower codes."""
        self.ei = EventInterface()
        self.ei.setDB('AmazonMT')
        self.ei.setCollection('candidate_event_25by25_merged')
        self.representor = Representor()

        self._loadCrowdFlowerCode()

        # _cacheAll() writes into these, so they have to exist up front.
        self.cache_events = {}
        self.cache_photos = {}

    def getAllEvents(self):
        """Return a JSON list of a random sample of events with >3 photos.

        Each kept event gets placeholder 'urgency', 'volume' and 'stats'
        values, and its photos are replaced by at most five representative
        ones.
        """
        events = []
        # Materialise the cursor before the per-event work below.
        for e in list(self.ei.getAllDocuments()):
            if len(e['photos']) <= 3:
                continue
            # Keeps an event only when the random draw is above 0.1.
            if random.random() <= 0.1:
                continue
            e['_id'] = str(e['_id'])
            e['urgency'] = 58
            e['volume'] = 99
            e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
            e['photos'] = self.representor.getRepresentivePhotos(e)[:5]
            events.append(e)
        return json.dumps(events)
    getAllEvents.exposed = True

    def _loadCrowdFlowerCode(self):
        """Load ``event_id,code`` pairs from 'crowdflower_code.txt'.

        The result is stored in ``self.cf_code``.  Line endings are stripped
        so codes do not carry a trailing newline, and lines without a comma
        (for example a blank last line) are skipped.
        """
        self.cf_code = {}
        with open('crowdflower_code.txt') as code_file:
            for line in code_file:
                fields = line.rstrip('\r\n').split(',')
                if len(fields) < 2:
                    continue
                self.cf_code[fields[0]] = fields[1]

    def getCrowdFlowerCode(self, event_id):
        """Return the CrowdFlower code for *event_id*, or None if unknown."""
        return self.cf_code.get(event_id)
    getCrowdFlowerCode.exposed = True

    def getAllEventsIDs(self):
        """Return a JSON list with the string form of every document ID."""
        return json.dumps([str(_id) for _id in self.ei.getAllDocumentIDs()])
    # Not exposed as an endpoint.

    def _deleteExtraMeta(self, photo):
        """Strip bulky metadata from *photo* in place and return it.

        Removal is best effort: a field that is absent, or whose parent
        cannot be indexed, is skipped silently.
        """
        unwanted = (
            ('comments',),
            ('caption', 'from'),
            ('filter',),
            ('user',),
            ('images', 'standard_resolution'),
            ('images', 'low_resolution'),
            ('likes',),
        )
        for path in unwanted:
            holder = photo
            try:
                for key in path[:-1]:
                    holder = holder[key]
                del holder[path[-1]]
            except (KeyError, TypeError, IndexError, AttributeError):
                # missing key or non-mapping parent: nothing to strip
                pass
        return photo

    def getPhotosByID(self, event_id):
        """Return JSON with all photos of an event and its top representatives.

        The result is a two-element list of ``[label, count, photos]``
        entries: 'all_photos' and 'top_10_representative' (at most ten).
        """
        event = json.loads(self.getEventByID(event_id))
        photos = event['photos']
        rep_photos = self.representor.getRepresentivePhotos(event)[:10]
        res = [
            ['all_photos', len(photos), photos],
            ['top_10_representative', len(rep_photos), rep_photos],
        ]
        return json.dumps(res)
    getPhotosByID.exposed = True

    def _cacheAll(self):
        """Fill ``cache_events`` and ``cache_photos`` for every listed event.

        Progress is printed every 100 steps; the counter keeps running across
        both passes.
        """
        print('begin cache')
        all_events = self.getAllEvents()
        print(type(all_events))
        all_events = json.loads(all_events)
        cnt = 0
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            self.cache_events[e['_id']] = json.dumps(e)
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            self.cache_photos[e['_id']] = self.getPhotosByID(e['_id'])

    def getEventByID(self, event_id):
        """Return one event as JSON after ``selectOnePhotoForOneUser()``."""
        event = Event(self.ei.getEventByID(event_id))
        event.selectOnePhotoForOneUser()
        event_dic = event.toJSON()
        event_dic['_id'] = str(event_dic['_id'])
        return json.dumps(event_dic)
    getEventByID.exposed = True

    def getTopKeywords(self, event_id):
        """Return the ten top keywords of an event as a JSON list."""
        event = self.ei.getEventByID(event_id)
        words = EventFeature(event).getTopKeywords(k=10)
        return json.dumps(words)
    # Not exposed as an endpoint.

    def setLabel(self, event_id, label):
        """Store ``int(label)`` as the 'label' field of the given event."""
        event = self.ei.getEventByID(str(event_id))
        print('setting  %s label =  %s' % (event_id, label))
        event['label'] = int(label)
        self.ei.updateDocument(event)
Пример #15
0
class Root:
    """HTTP handler that serves online candidate events as JSON.

    Methods flagged with ``exposed = True`` are the public endpoints
    (presumably CherryPy's exposure convention -- confirm).  Events are read
    from the 'online_candidate' collection of the 'citybeat' database.
    """

    def __init__(self):
        """Connect to the event collection and load the CrowdFlower codes."""
        self.ei = EventInterface()
        self.ei.setDB('citybeat')
        self.ei.setCollection('online_candidate')

        self._loadCrowdFlowerCode()

        # Filled by _cacheAll(), which is not invoked here, so both start empty.
        self.cache_events = {}
        self.cache_photos = {}

    def getAllEvents(self):
        """Return every event as a JSON list with placeholder statistics."""
        events = []
        # Materialise the cursor before mutating the documents.
        for e in list(self.ei.getAllDocuments()):
            e['_id'] = str(e['_id'])
            e['urgency'] = 58
            e['volume'] = 99
            e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
            events.append(e)
        return json.dumps(events)
    getAllEvents.exposed = True

    def _loadCrowdFlowerCode(self):
        """Load ``event_id,code`` pairs from 'crowdflower_code.txt'.

        The result is stored in ``self.cf_code``.  Line endings are stripped
        so codes do not carry a trailing newline, and lines without a comma
        (for example a blank last line) are skipped.
        """
        self.cf_code = {}
        with open('crowdflower_code.txt') as code_file:
            for line in code_file:
                fields = line.rstrip('\r\n').split(',')
                if len(fields) < 2:
                    continue
                self.cf_code[fields[0]] = fields[1]

    def getCrowdFlowerCode(self, event_id):
        """Return the CrowdFlower code for *event_id*, or None if unknown."""
        return self.cf_code.get(event_id)
    getCrowdFlowerCode.exposed = True

    def getAllEventsIDs(self):
        """Return a JSON list with the string form of every document ID."""
        return json.dumps([str(_id) for _id in self.ei.getAllDocumentIDs()])
    # Not exposed as an endpoint.

    def _deleteExtraMeta(self, photo):
        """Strip bulky metadata from *photo* in place and return it.

        Removal is best effort: a field that is absent, or whose parent
        cannot be indexed, is skipped silently.
        """
        unwanted = (
            ('comments',),
            ('caption', 'from'),
            ('filter',),
            ('user',),
            ('images', 'standard_resolution'),
            ('images', 'low_resolution'),
            ('likes',),
        )
        for path in unwanted:
            holder = photo
            try:
                for key in path[:-1]:
                    holder = holder[key]
                del holder[path[-1]]
            except (KeyError, TypeError, IndexError, AttributeError):
                # missing key or non-mapping parent: nothing to strip
                pass
        return photo

    def getPhotosByID(self, event_id):
        """Return the keyword/photo groups of an event as JSON.

        A cached answer is returned with the extra photo metadata stripped.
        Otherwise the groups from the plain top-keyword ranking and from the
        TF-IDF ranking are interleaved, each keyword appearing once.
        """
        if event_id in self.cache_photos:
            print('cached. return directly')
            groups = json.loads(self.cache_photos[event_id])
            for group in groups:
                # index 2 of every cached group holds its photo list
                group[2] = [self._deleteExtraMeta(p) for p in group[2]]
            return json.dumps(groups)

        # NOTE(review): getEventByID() in this class indexes the value
        # returned by self.ei.getEventByID() like a plain document, which
        # would not provide the two methods called below.  Confirm the
        # returned type (an EventFrontend-style wrapper may be required).
        event = self.ei.getEventByID(event_id)
        top_words_list = event.getTopKeywordsAndPhotos(20, 5)
        words_pics_list = event.getTopKeywordsAndPhotosByTFIDF(20, 5)
        keywords_shown = set()

        res = []
        for tf, idf in zip(top_words_list, words_pics_list):
            if tf[0] not in keywords_shown:
                keywords_shown.add(tf[0])
                res.append(tf)
            if idf[0] not in keywords_shown:
                keywords_shown.add(idf[0])
                res.append(idf)

        return json.dumps(res)
    getPhotosByID.exposed = True

    def _cacheAll(self):
        """Fill ``cache_events`` and ``cache_photos`` for every event.

        Progress is printed every 100 steps; the counter keeps running across
        both passes.
        """
        print('begin cache')
        all_events = self.getAllEvents()
        print(type(all_events))
        all_events = json.loads(all_events)
        cnt = 0
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            self.cache_events[e['_id']] = json.dumps(e)
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            self.cache_photos[e['_id']] = self.getPhotosByID(e['_id'])

    def getEventByID(self, event_id):
        """Return one event as JSON, from the cache when available.

        Cached events have the extra photo metadata stripped before they are
        returned.
        """
        if event_id in self.cache_events:
            # Announce the cache hit before returning so the message is reachable.
            print('event cached. return directly')
            tmp = json.loads(self.cache_events[event_id])
            tmp['photos'] = [self._deleteExtraMeta(p) for p in tmp['photos']]
            return json.dumps(tmp)

        event = self.ei.getEventByID(event_id)
        event['_id'] = str(event['_id'])
        return json.dumps(event)
    getEventByID.exposed = True

    def getTopKeywords(self, event_id):
        """Return the ten top keywords of an event as a JSON list."""
        event = self.ei.getEventByID(event_id)
        words = EventFeature(event).getTopKeywords(k=10)
        return json.dumps(words)
    # Not exposed as an endpoint.

    def setLabel(self, event_id, label):
        """Store ``int(label)`` as the 'label' field of the given event."""
        event = self.ei.getEventByID(str(event_id))
        print('setting  %s label =  %s' % (event_id, label))
        event['label'] = int(label)
        self.ei.updateDocument(event)
Пример #16
0
from utility.event_interface import EventInterface

# Ad-hoc script: fetch the photo distribution array of the
# 'baseline_candidate_events' collection in the 'citybeat' database and
# print it to stdout.
ei = EventInterface()
ei.setDB('citybeat')
ei.setCollection('baseline_candidate_events')
p = ei.getPhotoDistributionArray()
print p
Пример #17
0
class Root:
    """HTTP handler that serves online candidate events as JSON.

    Methods flagged with ``exposed = True`` are the public endpoints
    (presumably CherryPy's exposure convention -- confirm).  Events are read
    from the 'online_candidate' collection of the 'citybeat' database.
    """

    def __init__(self):
        """Connect to the event collection and load the CrowdFlower codes."""
        self.ei = EventInterface()
        self.ei.setDB('citybeat')
        self.ei.setCollection('online_candidate')

        self._loadCrowdFlowerCode()

        # Filled by _cacheAll(), which is not invoked here, so both start empty.
        self.cache_events = {}
        self.cache_photos = {}

    def getAllEvents(self):
        """Return every event as a JSON list with placeholder statistics."""
        events = []
        # Materialise the cursor before mutating the documents.
        for e in list(self.ei.getAllDocuments()):
            e['_id'] = str(e['_id'])
            e['urgency'] = 58
            e['volume'] = 99
            e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
            events.append(e)
        return json.dumps(events)

    getAllEvents.exposed = True

    def _loadCrowdFlowerCode(self):
        """Load ``event_id,code`` pairs from 'crowdflower_code.txt'.

        The result is stored in ``self.cf_code``.  Line endings are stripped
        so codes do not carry a trailing newline, and lines without a comma
        (for example a blank last line) are skipped.
        """
        self.cf_code = {}
        with open('crowdflower_code.txt') as code_file:
            for line in code_file:
                fields = line.rstrip('\r\n').split(',')
                if len(fields) < 2:
                    continue
                self.cf_code[fields[0]] = fields[1]

    def getCrowdFlowerCode(self, event_id):
        """Return the CrowdFlower code for *event_id*, or None if unknown."""
        return self.cf_code.get(event_id)

    getCrowdFlowerCode.exposed = True

    def getAllEventsIDs(self):
        """Return a JSON list with the string form of every document ID."""
        return json.dumps([str(_id) for _id in self.ei.getAllDocumentIDs()])

    # Not exposed as an endpoint.

    def _deleteExtraMeta(self, photo):
        """Strip bulky metadata from *photo* in place and return it.

        Removal is best effort: a field that is absent, or whose parent
        cannot be indexed, is skipped silently.
        """
        unwanted = (
            ('comments',),
            ('caption', 'from'),
            ('filter',),
            ('user',),
            ('images', 'standard_resolution'),
            ('images', 'low_resolution'),
            ('likes',),
        )
        for path in unwanted:
            holder = photo
            try:
                for key in path[:-1]:
                    holder = holder[key]
                del holder[path[-1]]
            except (KeyError, TypeError, IndexError, AttributeError):
                # missing key or non-mapping parent: nothing to strip
                pass
        return photo

    def getPhotosByID(self, event_id):
        """Return the keyword/photo groups of an event as JSON.

        A cached answer is returned with the extra photo metadata stripped.
        Otherwise the groups from the plain top-keyword ranking and from the
        TF-IDF ranking are interleaved, each keyword appearing once.
        """
        if event_id in self.cache_photos:
            print('cached. return directly')
            groups = json.loads(self.cache_photos[event_id])
            for group in groups:
                # index 2 of every cached group holds its photo list
                group[2] = [self._deleteExtraMeta(p) for p in group[2]]
            return json.dumps(groups)

        # NOTE(review): getEventByID() in this class indexes the value
        # returned by self.ei.getEventByID() like a plain document, which
        # would not provide the two methods called below.  Confirm the
        # returned type (an EventFrontend-style wrapper may be required).
        event = self.ei.getEventByID(event_id)
        top_words_list = event.getTopKeywordsAndPhotos(20, 5)
        words_pics_list = event.getTopKeywordsAndPhotosByTFIDF(20, 5)
        keywords_shown = set()

        res = []
        for tf, idf in zip(top_words_list, words_pics_list):
            if tf[0] not in keywords_shown:
                keywords_shown.add(tf[0])
                res.append(tf)
            if idf[0] not in keywords_shown:
                keywords_shown.add(idf[0])
                res.append(idf)

        return json.dumps(res)

    getPhotosByID.exposed = True

    def _cacheAll(self):
        """Fill ``cache_events`` and ``cache_photos`` for every event.

        Progress is printed every 100 steps; the counter keeps running across
        both passes.
        """
        print('begin cache')
        all_events = self.getAllEvents()
        print(type(all_events))
        all_events = json.loads(all_events)
        cnt = 0
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            self.cache_events[e['_id']] = json.dumps(e)
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print(cnt)
            self.cache_photos[e['_id']] = self.getPhotosByID(e['_id'])

    def getEventByID(self, event_id):
        """Return one event as JSON, from the cache when available.

        Cached events have the extra photo metadata stripped before they are
        returned.
        """
        if event_id in self.cache_events:
            # Announce the cache hit before returning so the message is reachable.
            print('event cached. return directly')
            tmp = json.loads(self.cache_events[event_id])
            tmp['photos'] = [self._deleteExtraMeta(p) for p in tmp['photos']]
            return json.dumps(tmp)

        event = self.ei.getEventByID(event_id)
        event['_id'] = str(event['_id'])
        return json.dumps(event)

    getEventByID.exposed = True

    def getTopKeywords(self, event_id):
        """Return the ten top keywords of an event as a JSON list."""
        event = self.ei.getEventByID(event_id)
        words = EventFeature(event).getTopKeywords(k=10)
        return json.dumps(words)

    # Not exposed as an endpoint.

    def setLabel(self, event_id, label):
        """Store ``int(label)`` as the 'label' field of the given event."""
        event = self.ei.getEventByID(str(event_id))
        print('setting  %s label =  %s' % (event_id, label))
        event['label'] = int(label)
        self.ei.updateDocument(event)
Пример #18
0
class Root:
    """HTTP handler that serves recent front-end events and stats as JSON.

    Methods flagged with ``exposed = True`` are the public endpoints
    (presumably CherryPy's exposure convention -- confirm).  Database and
    collection names come from ``InstagramConfig``.
    """

    def __init__(self):
        """Connect to the front-end event collection and the stats store."""
        self.ei = EventInterface()
        self.ei.setDB(InstagramConfig.event_db)
        self.ei.setCollection(InstagramConfig.front_end_events)
        self.stats_interface = StatsInterface()

    def getAllEvents(self):
        """Return the five newest strong events of the last three days as JSON.

        An event qualifies when its 'actual_value' is at least 6 and its
        'zscore' is above 3.0.  Each event also gets placeholder 'urgency',
        'volume' and 'stats' values.
        """
        now = int(getCurrentStampUTC())
        three_days_before = now - 3 * 24 * 3600
        # NOTE(review): the bound is sent as a string, so 'created_time' is
        # presumably stored as a string too -- confirm.
        condition = {'created_time': {'$gte': str(three_days_before)}}
        events = []
        for e in self.ei.getAllDocuments(condition):
            e['_id'] = str(e['_id'])
            e['urgency'] = 58
            e['volume'] = 99
            e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
            if e['actual_value'] >= 6 and e['zscore'] > 3.0:
                events.append(e)
        events.sort(key=lambda x: x['created_time'], reverse=True)
        for w in events:
            print(w['created_time'])
        return json.dumps(events[:5])
    getAllEvents.exposed = True

    def getAllEventsIDs(self):
        """Return a JSON list with the string form of every document ID."""
        return json.dumps([str(_id) for _id in self.ei.getAllDocumentIDs()])
    # Not exposed as an endpoint.

    def getPhotosByID(self, event_id):
        """Return JSON with all photos of an event and its first ten photos.

        The result is a two-element list of ``[label, count, photos]``
        entries.  The 'top_10_representative' entry is cut to ten photos so
        that its list agrees with its reported count.
        """
        event = json.loads(self.getEventByID(event_id))
        photos = event['photos']
        rep_photos = photos[:10]
        res = [
            ['all_photos', len(photos), photos],
            ['top_10_representative', len(rep_photos), rep_photos],
        ]
        return json.dumps(res)
    getPhotosByID.exposed = True

    def getEventByID(self, event_id):
        """Return one event as JSON after ``selectOnePhotoForOneUser()``."""
        event = Event(self.ei.getEventByID(event_id))
        event.selectOnePhotoForOneUser()
        event_dic = event.toDict()
        event_dic['_id'] = str(event_dic['_id'])
        return json.dumps(event_dic)
    getEventByID.exposed = True

    def setLabel(self, event_id, label):
        """Store ``int(label)`` as the 'label' field of the given event."""
        event = self.ei.getEventByID(str(event_id))
        event['label'] = int(label)
        self.ei.updateDocument(event)
    # Not exposed as an endpoint.

    def getLatestStats(self):
        """Return the newest stats document of the last five minutes as JSON."""
        now = int(getCurrentStampUTC()) - 5 * 60
        condition = {'created_time': {"$gte": str(now)}}
        # NOTE(review): indexing [0] presumably raises when no stats document
        # falls inside the window -- confirm whether callers rely on that.
        recent = self.stats_interface.getAllDocuments(condition=condition)
        most_recent_stats = recent.sort('created_time', -1)[0]
        most_recent_stats['_id'] = str(most_recent_stats['_id'])
        return json.dumps(most_recent_stats)
    getLatestStats.exposed = True
Пример #19
0
class Root:
    def __init__(self):
        """Set up the event interface, the representor and the code table."""
        interface = EventInterface()
        interface.setDB('AmazonMT')
        interface.setCollection('candidate_event_25by25_merged')
        self.ei = interface
        self.representor = Representor()
        # Alternative sources that were used before: DB 'citybeat' with the
        # collections 'next_week_candidate_event_25by25_merged' and
        # 'online_candidate'.

        self._loadCrowdFlowerCode()

    def getAllEvents(self):
        event_cursor = self.ei.getAllDocuments()
        events = []
        tmp_events = [e for e in event_cursor]
        for e in tmp_events:
            if len(e['photos']) > 3:
                if random.random() > 0.1:
                    e['_id'] = str(e['_id'])
                    e['urgency'] = 58
                    e['volume'] = 99
                    e['stats'] = {'photos': 50, 'tweets': 0, 'checkins': 0}
                    rep_photos = self.representor.getRepresentivePhotos(e)
                    e['photos'] = rep_photos[:min(5, len(rep_photos))]
                    events.append(e)
        return json.dumps(events)

    getAllEvents.exposed = True

    def _loadCrowdFlowerCode(self):
        lines = open('crowdflower_code.txt').readlines()
        self.cf_code = {}
        for line in lines:
            t = line.split(',')
            self.cf_code[t[0]] = t[1]

    def getCrowdFlowerCode(self, event_id):
        if event_id in self.cf_code:
            return self.cf_code[event_id]
        else:
            return None

    getCrowdFlowerCode.exposed = True

    def getAllEventsIDs(self):
        object_ids = self.ei.getAllDocumentIDs()
        return_value = []
        for _id in object_ids:
            return_value.append(str(_id))
        return json.dumps(return_value)

    #getAllEventsIDs.exposed = True

    def _deleteExtraMeta(self, photo):
        try:
            del photo['comments']
        except Exception as e:
            pass

        try:
            del photo['caption']['from']
        except Exception as e:
            pass
        try:
            del photo['filter']
        except Exception as e:
            pass
        try:
            del photo['user']
        except Exception as e:
            pass
        try:
            del photo['images']['standard_resolution']
        except Exception as e:
            pass
        try:
            del photo['images']['low_resolution']
        except Exception as e:
            pass
        try:
            del photo['likes']
        except Exception as e:
            pass
        try:
            del photo['likes']
        except Exception as e:
            pass
        return photo

    def getPhotosByID(self, event_id):
        event = json.loads(self.getEventByID(event_id))
        #event = EventFrontend(event, self.c)

        #top_words_list = event.getTopKeywordsAndPhotos(20,5)
        #words_pics_list = event.getTopKeywordsAndPhotosByTFIDF(20, 5)
        #keywords_shown = set()

        res = []

        all_photos = []
        top10_photos = []
        all_photos.append('all_photos')
        #print event['photos']
        all_photos.append(len(event['photos']))
        all_photos.append(event['photos'])

        rep_photos = self.representor.getRepresentivePhotos(event)
        rep_photos = rep_photos[:10]
        top10_photos.append('top_10_representative')
        top10_photos.append(min(10, len(rep_photos)))
        top10_photos.append(rep_photos)

        res.append(all_photos)
        res.append(top10_photos)
        """
        for tf, idf in zip(top_words_list,words_pics_list):
            if tf[0] not in keywords_shown:
                keywords_shown.add(tf[0])
                res.append(tf)
            if idf[0] not in keywords_shown:
                keywords_shown.add(idf[0])
                res.append(idf)
        """
        r = json.dumps(res)
        #print r
        #r = json.dumps(words_pics_list + top_words_list)
        return r

    getPhotosByID.exposed = True

    def _cacheAll(self):
        print 'begin cache'
        all_events = self.getAllEvents()
        print type(all_events)
        all_events = json.loads(all_events)
        cnt = 0
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print cnt
            self.cache_events[e['_id']] = json.dumps(e)
        for e in all_events:
            cnt += 1
            if cnt % 100 == 0:
                print cnt
            self.cache_photos[e['_id']] = self.getPhotosByID(e['_id'])

    def getEventByID(self, event_id):
        """Return the event with the given ID serialized as a JSON string.

        The stored document is wrapped in ``Event``,
        ``selectOnePhotoForOneUser()`` is applied to it, and the result of
        ``toJSON()`` is dumped after its ``_id`` is converted to ``str``.
        """
        wrapped = Event(self.ei.getEventByID(event_id))
        wrapped.selectOnePhotoForOneUser()
        payload = wrapped.toJSON()
        payload['_id'] = str(payload['_id'])
        return json.dumps(payload)

    getEventByID.exposed = True

    def getTopKeywords(self, event_id):
        """Return ``EventFeature.getTopKeywords(k=10)`` for an event as JSON.

        The event document is fetched by ``event_id`` from ``self.ei`` and
        wrapped in ``EventFeature`` before the keywords are requested.
        """
        features = EventFeature(self.ei.getEventByID(event_id))
        return json.dumps(features.getTopKeywords(k=10))

    #getTopKeywords.exposed = True

    def setLabel(self, event_id, label):
        """Set the integer ``label`` on an event and write the event back.

        The event is fetched by ``str(event_id)``, a trace line with the ID
        and label is printed, ``label`` is converted with ``int`` and the
        document is passed to ``self.ei.updateDocument``.
        """
        event = self.ei.getEventByID(str(event_id))
        print 'setting ', event_id, 'label = ', label
        # int() raises ValueError for a non-numeric label, before the
        # document is modified or saved.
        event['label'] = int(label)
        self.ei.updateDocument(event)