示例#1
0
 def __init__(self, vectorizer = None, db='AmazonMT', collection='candidate_event_25by25_merged'):
     """Build a TF-IDF model over all photo captions of every event in db.collection.

     Events are loaded once, reduced to one photo per user, and their captions
     are used to fit the vectorizer, so representative photos can be chosen later.

     A custom TfidfVectorizer may be passed to override the default parameters.
     See http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
     """
     
     self.ei = EventInterface()
     self.ei.setDB(db)
     self.ei.setCollection(collection)
     self.events = []
     for e in self.ei.getAllDocuments():
         event = Event(e)
         # keep at most one photo per user so prolific users do not dominate the corpus
         event.selectOnePhotoForOneUser()
         e = event.toJSON()
         self.events.append(e)
     #self.events = [e for e in self.ei.getAllDocuments()]
     self._captions = self._getAllCaptions()
     
     if vectorizer is None:
         # char 4-grams bounded at word edges; low max_df prunes boilerplate terms
         self.vectorizer = TfidfVectorizer( max_df=0.05, min_df = 1, strip_accents='ascii', smooth_idf=True, preprocessor = self._preProcessor, sublinear_tf=True, norm = 'l2', analyzer='char_wb', ngram_range=(4,4), stop_words = 'english')
     else:
         self.vectorizer = vectorizer
     self.vectorizer.fit_transform(self._captions)
示例#2
0
def test():
    """Smoke test: print the id and photo URLs of two front-end Instagram events."""
    ei = EventInterface(collection='instagram_front_end_events')
    cur = ei.getAllDocuments(limit=2)
    for e in cur:
        e = PhotoEvent(e)
        print e.getID()
        print e.getAllPhotoImageUrls()
示例#3
0
def findLastWeekEvents():
    """Write one labeling URL per front-end event created on/after 1381228200 to a CSV file."""
    interface = EventInterface()
    interface.setCollection(InstagramConfig.front_end_events)

    cursor = interface.getAllFields(
        fields=['_id'],
        condition={'created_time': {'$gte': '1381228200'}})

    base_url = 'http://ec2-23-22-67-45.compute-1.amazonaws.com/cb/event/'
    rows = [[base_url + str(doc['_id'])] for doc in cursor]

    with open('all_classified_events_from_10_09_to_10_15.csv', 'wb') as csvfile:
        csv.writer(csvfile, delimiter=',').writerows(rows)
示例#4
0
def findTree():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    events = {}
    fid1 = open('labeled_data_cf/181_positive.txt', 'r')
    true_events = []

    for line in fid1:
        t = line.split(',')
        id = str(t[0])
        label = int(t[1])
        if label == 1:
            pass
        else:
            continue
        events[id] = label

    fid1.close()

    words = ['motor']
    for id, label in events.items():
        event = ei.getEventByID(id)
        e = Event(event)
        if e.containKeywords(words, 1):
            print id
示例#5
0
def generateData(biased=True):
    """Print ARFF header plus comma-joined feature vectors for labeled events.

    All positives are printed; negatives are shuffled and, when biased is
    False, truncated to the same count as the positives.
    """
    ei = EventInterface()
    ei.setDB('historic_alarm')
    ei.setCollection('labeled_event')
    events = ei.getAllDocuments()

    EventFeature.GenerateArffFileHeader()
    true_events = []
    false_events = []
    for event in events:
        event = EventFeature(event)
        feature_vector = event.extractFeatures(3)
        # the last feature slot holds the label
        if feature_vector[-1] == 1:
            true_events.append(feature_vector)
        else:
            false_events.append(feature_vector)

    random.shuffle(false_events)

    for fv in true_events:
        for i in xrange(0, len(fv) - 1):
            print fv[i], ',',
        print fv[-1]

    j = 0
    for fv in false_events:
        for i in xrange(0, len(fv) - 1):
            print fv[i], ',',
        print fv[-1]
        j += 1
        # balanced output: stop once negatives match the positive count
        if not biased and j == len(true_events):
            break
示例#6
0
def loadNextWeekData():
    """Read label_next_week.txt and split labeled events into (true, false) lists."""
    interface = EventInterface()
    interface.setDB('citybeat')
    interface.setCollection('next_week_candidate_event_25by25_merged')

    positives = []
    negatives = []

    label_file = open('labeled_data_cf/label_next_week.txt', 'r')

    for row in label_file:
        parts = row.split(',')
        event_id = str(parts[0])
        event_label = int(parts[1])

        doc = interface.getDocument({'_id': ObjectId(event_id)})
        doc['label'] = event_label
        wrapped = Event(doc)
        # skip sparse events and those explicitly labeled 0
        if wrapped.getActualValue() < 8 or doc['label'] == 0:
            continue
        if doc['label'] == 1:
            positives.append(doc)
        else:
            negatives.append(doc)

    label_file.close()
    return positives, negatives
示例#7
0
	def buildCorpusOnDB(self, db, collection):
		"""Add every event in db.collection to this corpus, one word-list document per event."""
		ei = EventInterface()
		ei.setDB(db)
		ei.setCollection(collection)
		events = ei.getAllDocuments()
		for event in events:
			word_list = self.getWordList(event)
			self._addDocument(word_list)
示例#8
0
def main():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    events = ei.getAllDocuments()

    event = ei.getEventByID('511478c8c2a3754cfe6684a9')
    print event['region']

    lat = (event['region']['min_lat'] + event['region']['max_lat']) / 2
    lon = (event['region']['min_lng'] + event['region']['max_lng']) / 2
    fid1 = open('region_cache/25_25.txt', 'r')
    for line in fid1:
        cor = line.split(' ')
        for i in xrange(len(cor)):
            cor[i] = float(cor[i])
        if float(cor[0]) <= lat and lat <= float(cor[2]) and float(
                cor[1]) <= lon and lon <= float(cor[3]):
            min_lat = cor[0]
            max_lat = cor[2]
            min_lng = cor[1]
            max_lng = cor[3]
            print min_lat, max_lat, min_lng, max_lng
            break
    fid1.close()

    fid2 = open('labeled_data_cf/181_positive.txt', 'r')

    labels = {}

    for line in fid2:
        t = line.split(',')
        labels[str(t[0])] = int(t[1])
    fid2.close()

    pos = 0
    tot = 0
    for event in events:
        region = event['region']
        id = str(event['_id'])
        if id not in labels.keys():
            continue

        if (floatEqual(region['min_lat'], min_lat)
                and floatEqual(region['max_lat'], max_lat)
                and floatEqual(region['min_lng'], min_lng)
                and floatEqual(region['max_lng'], max_lng)):
            tot += 1
            if labels[id] == 1:
                pos += 1
                print id
    print pos
    print tot
示例#9
0
def loadUnbalancedData(_182):
    """Load crowd-labeled events; return (true_events, false_events).

    _182 selects which manual-override file is applied on top of the
    crowdflower labels in data2.txt. Negatives are kept only at full
    confidence; events with actual_value < 8 or label 0 are dropped.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    true_events = []
    false_events = []
    override_path = ('labeled_data_cf/182_positive.txt' if _182
                     else 'labeled_data_cf/181_positive.txt')

    # manual label overrides: event id -> label
    modified_events = {}
    with open(override_path, 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            modified_events[str(t[0])] = int(t[1])

    # crowd labels: "<label> <confidence> <.../event_id>" per line
    with open('labeled_data_cf/data2.txt', 'r') as fid:
        for line in fid:
            if len(line.strip()) == 0:
                continue
            t = line.strip().split()
            if len(t) != 3:
                continue
            label = t[0].lower()
            confidence = float(t[1])
            event_id = str(t[2].split('/')[-1])
            if label == 'not_sure':
                continue
            label = 1 if label == 'yes' else -1
            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            # 'in' replaces the deprecated dict.has_key()
            if event_id in modified_events:
                event['label'] = modified_events[event_id]

            e = Event(event)
            if e.getActualValue() < 8 or event['label'] == 0:
                continue
            if event['label'] == 1:
                true_events.append(event)
            elif event['label'] == -1 and confidence == 1:
                false_events.append(event)

    return true_events, false_events
示例#10
0
def findTree():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    events = {}
    fid1 = open('labeled_data_cf/181_positive.txt', 'r')
    true_events = []

    for line in fid1:
        t = line.split(',')
        id = str(t[0])
        label = int(t[1])
        if label == 1:
            pass
        else:
            continue
        events[id] = label

    fid1.close()

    words = ['motor']
    for id, label in events.items():
        event = ei.getEventByID(id)
        e = Event(event)
        if e.containKeywords(words, 1):
            print id
def loadNextWeekData():
    """Read label_next_week.txt and split labeled events into (true, false) lists."""
    interface = EventInterface()
    interface.setDB('citybeat')
    interface.setCollection('next_week_candidate_event_25by25_merged')

    positives = []
    negatives = []

    label_file = open('labeled_data_cf/label_next_week.txt', 'r')

    for row in label_file:
        parts = row.split(',')
        event_id = str(parts[0])
        event_label = int(parts[1])

        doc = interface.getDocument({'_id': ObjectId(event_id)})
        doc['label'] = event_label
        wrapped = Event(doc)
        # skip sparse events and those explicitly labeled 0
        if wrapped.getActualValue() < 8 or doc['label'] == 0:
            continue
        if doc['label'] == 1:
            positives.append(doc)
        else:
            negatives.append(doc)

    label_file.close()
    return positives, negatives
示例#12
0
def generateData(biased=True):
    """Print ARFF header plus comma-joined feature vectors for labeled events.

    All positives are printed; negatives are shuffled and, when biased is
    False, truncated to the same count as the positives.
    """
    ei = EventInterface()
    ei.setDB("historic_alarm")
    ei.setCollection("labeled_event")
    events = ei.getAllDocuments()

    EventFeature.GenerateArffFileHeader()
    true_events = []
    false_events = []
    for event in events:
        event = EventFeature(event)
        feature_vector = event.extractFeatures(3)
        # the last feature slot holds the label
        if feature_vector[-1] == 1:
            true_events.append(feature_vector)
        else:
            false_events.append(feature_vector)

    random.shuffle(false_events)

    for fv in true_events:
        for i in xrange(0, len(fv) - 1):
            print fv[i], ",",
        print fv[-1]

    j = 0
    for fv in false_events:
        for i in xrange(0, len(fv) - 1):
            print fv[i], ",",
        print fv[-1]
        j += 1
        # balanced output: stop once negatives match the positive count
        if not biased and j == len(true_events):
            break
示例#13
0
def loadUnbalancedData(_182):
    """Load crowd-labeled events; return (true_events, false_events).

    _182 selects which manual-override file is applied on top of the
    crowdflower labels in data2.txt. Negatives are kept only at full
    confidence; events with actual_value < 8 or label 0 are dropped.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    true_events = []
    false_events = []
    override_path = ('labeled_data_cf/182_positive.txt' if _182
                     else 'labeled_data_cf/181_positive.txt')

    # manual label overrides: event id -> label
    modified_events = {}
    with open(override_path, 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            modified_events[str(t[0])] = int(t[1])

    # crowd labels: "<label> <confidence> <.../event_id>" per line
    with open('labeled_data_cf/data2.txt', 'r') as fid:
        for line in fid:
            if len(line.strip()) == 0:
                continue
            t = line.strip().split()
            if len(t) != 3:
                continue
            label = t[0].lower()
            confidence = float(t[1])
            event_id = str(t[2].split('/')[-1])
            if label == 'not_sure':
                continue
            label = 1 if label == 'yes' else -1
            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            # 'in' replaces the deprecated dict.has_key()
            if event_id in modified_events:
                event['label'] = modified_events[event_id]

            e = Event(event)
            if e.getActualValue() < 8 or event['label'] == 0:
                continue
            if event['label'] == 1:
                true_events.append(event)
            elif event['label'] == -1 and confidence == 1:
                false_events.append(event)

    return true_events, false_events
示例#14
0
def main():
	ei = EventInterface()
	ei.setDB('citybeat')
	ei.setCollection('candidate_event_25by25_merged')
	events = ei.getAllDocuments()
	
	
	event = ei.getEventByID('511478c8c2a3754cfe6684a9')
	print event['region']
		
	lat = (event['region']['min_lat'] + event['region']['max_lat'])/2
	lon = (event['region']['min_lng'] + event['region']['max_lng'])/2
	fid1 = open('region_cache/25_25.txt', 'r')
	for line in fid1:
		cor = line.split(' ')
		for i in xrange(len(cor)):
			cor[i] = float(cor[i])
		if float(cor[0]) <= lat and lat <= float(cor[2]) and float(cor[1]) <= lon and lon <= float(cor[3]):
			min_lat = cor[0]
			max_lat = cor[2]
			min_lng = cor[1]
			max_lng = cor[3]
			print min_lat, max_lat, min_lng, max_lng
			break
	fid1.close()
	
	fid2 = open('labeled_data_cf/181_positive.txt', 'r')
		
	labels = {}
	
	for line in fid2:
		t = line.split(',')
		labels[str(t[0])] = int(t[1])
	fid2.close()
	
	pos = 0
	tot = 0
	for event in events:
		region = event['region']
		id = str(event['_id'])
		if id not in labels.keys():
			continue
		
		if (floatEqual(region['min_lat'], min_lat) and floatEqual(region['max_lat'], max_lat)
		   and floatEqual(region['min_lng'], min_lng) and floatEqual(region['max_lng'], max_lng)):
		  tot += 1
		  if labels[id] == 1:
		  	pos += 1
		  	print id
	print pos
	print tot
def testWithPhoto():
    """Smoke test: extract photo-based features for one event using one corpus."""
    corpus_all = buildAllCorpus(element_type='photos', debug=True)
    # grab an arbitrary (key, corpus) pair from the dict
    for key, corpus in corpus_all.items():
        break

    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    event = ei.getDocument()
    event = BaseFeatureProduction(event, corpus=corpus)
    print event.extractFeatures()
示例#16
0
def getAllActualEvents():
    """Return the crowd-labeled TRUE events from data2.txt.

    Manual overrides from 181_positive.txt replace the crowd label when
    present; events with actual_value < 8 or label 0 are dropped.
    """
    ei = EventInterface()
    ei.setDB("citybeat")
    ei.setCollection("candidate_event_25by25_merged")

    true_events = []

    # manual label overrides: event id -> label
    modified_events = {}
    with open("labeled_data_cf/181_positive.txt", "r") as fid2:
        for line in fid2:
            t = line.split(",")
            modified_events[str(t[0])] = int(t[1])

    # crowd labels: "<label> <confidence> <.../event_id>" per line
    with open("labeled_data_cf/data2.txt", "r") as fid:
        for line in fid:
            if len(line.strip()) == 0:
                continue
            t = line.strip().split()
            if len(t) != 3:
                continue
            label = t[0].lower()
            confidence = float(t[1])
            event_id = str(t[2].split("/")[-1])
            if label == "not_sure":
                continue
            label = 1 if label == "yes" else -1
            event = ei.getDocument({"_id": ObjectId(event_id)})
            event["label"] = label
            # 'in' replaces the deprecated dict.has_key()
            if event_id in modified_events:
                event["label"] = modified_events[event_id]

            e = Event(event)
            if e.getActualValue() < 8 or event["label"] == 0:
                continue
            if event["label"] == 1:
                true_events.append(event)

    return true_events
示例#17
0
def testWithTweet():
    """Print the ARFF header and tweet-based feature vectors for all candidate events."""
    cnt = 0
    corpus_all = buildAllCorpus(element_type='tweets', debug=False)
    ei = EventInterface()
    ei.setDB('citybeat_experiment')
    ei.setCollection('twitter_candidate_events')
    cur = ei.getAllDocuments()
    print TwitterFeature.GenerateArffFileHeader()
    for event in cur:
        region = Region(event['region'])
        # features are computed against the corpus of the event's region
        event = TwitterFeature(event, corpus=corpus_all[region.getKey()])
        # skip events with too little data
        if event.getActualValue() < 8:
            print '< 8'
            continue
        cnt += 1
        print event.extractFeatures()
    print  cnt, cur.count()
示例#18
0
    def __init__(self, vectorizer=None, db="AmazonMT", collection="candidate_event_25by25_merged"):
        """Build a TF-IDF model over all photo captions of every event in db.collection.

        Events are loaded once, reduced to one photo per user, and their captions
        are used to fit the vectorizer, so representative photos can be chosen later.

        A custom TfidfVectorizer may be passed to override the default parameters.
        See http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
        """

        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for e in self.ei.getAllDocuments():
            event = Event(e)
            # keep at most one photo per user so prolific users do not dominate the corpus
            event.selectOnePhotoForOneUser()
            e = event.toJSON()
            self.events.append(e)
        # self.events = [e for e in self.ei.getAllDocuments()]
        self._captions = self._getAllCaptions()

        if vectorizer is None:
            # char 4-grams bounded at word edges; low max_df prunes boilerplate terms
            self.vectorizer = TfidfVectorizer(
                max_df=0.05,
                min_df=1,
                strip_accents="ascii",
                smooth_idf=True,
                preprocessor=self._preProcessor,
                sublinear_tf=True,
                norm="l2",
                analyzer="char_wb",
                ngram_range=(4, 4),
                stop_words="english",
            )
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)
示例#19
0
def generateTrueLabelFile():
    """Print "<id>,<label>" for qualifying events: positives, then negatives, then unknowns."""
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    events = {}
    fid1 = open('labeled_data_cf/181_positive.txt', 'r')
    true_events = []
    false_events = []
    unknown_events = []

    # label file lines are "<id>,<label>"
    for line in fid1:
        t = line.split(',')
        id = str(t[0])
        label = int(t[1])
        events[id] = label

    fid1.close()

    for id, label in events.items():
        event = ei.getDocument({'_id': ObjectId(id)})
        event['label'] = label
        e = Event(event)
        # skip sparse events
        if e.getActualValue() < 8:
            #			print 'bad event ' + id
            continue
        if event['label'] == -1:
            false_events.append(event)
        else:
            if event['label'] == 1:
                true_events.append(event)
            else:
                unknown_events.append(event)

    for event in true_events + false_events + unknown_events:
        print str(event['_id']) + ',' + str(event['label'])
示例#20
0
def testWithTweet():
    """Smoke test: attach 30 tweets to one event and print its base features."""
    from corpus import buildAllCorpus

    corpus_all = buildAllCorpus(element_type="tweets", debug=True)
    # grab an arbitrary corpus from the dict
    for key, corpus in corpus_all.items():
        break

    ei = EventInterface()
    ei.setDB("citybeat")
    ei.setCollection("candidate_event_25by25_merged")
    event = ei.getDocument()
    print event
    ti = TweetInterface()
    cur = ti.getAllDocuments(limit=30)
    tweets = []
    for tweet in cur:
        tweets.append(tweet)
    # replace the photo payload with tweets before feature extraction
    del event["photos"]
    event["tweets"] = tweets
    event = BaseFeature(event, corpus=corpus)
    print event.printFeatures()
示例#21
0
def getBaselineEvents():
	ei = EventInterface()
	ei.setDB('citybeat')
	ei.setCollection('baseline_candidate_events')
	
	events = ei.getAllDocuments()
	
	event_list = []
	
	for event in events:
		e = Event(event)
		if e.getActualValue() < 8 or e.getZscore() < 3:
			continue
		event_list.append(event)
	
#	print len(event_list)
	
#	return 
	
	random.shuffle(event_list)
	
	for i in xrange(50):
		print event_list[i]['_id']	
示例#22
0
def generateTrueLabelFile():
	ei = EventInterface()
	ei.setDB('citybeat')
	ei.setCollection('candidate_event_25by25_merged')
	
	events = {}
	fid1 = open('labeled_data_cf/181_positive.txt', 'r')
	true_events = []
	false_events = []
	unknown_events = []
	
	for line in fid1:
		t = line.split(',')
		id = str(t[0])
		label = int(t[1])
		events[id] = label
		
	fid1.close()
	
	for id, label in events.items():
		event = ei.getDocument({'_id':ObjectId(id)})
		event['label'] = label
		e = Event(event)
		if e.getActualValue() < 8:
#			print 'bad event ' + id
			continue
		if event['label'] == -1:
			false_events.append(event)
		else:
			if event['label'] == 1:
				true_events.append(event)
			else:
				unknown_events.append(event)
	
	
	for event in true_events + false_events + unknown_events:
		print str(event['_id'])+','+str(event['label'])
from event_interface import EventInterface
from bson.objectid import ObjectId
from event_feature import EventFeature


# Iterate the labeled historic-alarm events and compute each one's average
# photo location (the label import step below is kept commented out).
ei = EventInterface()
ei.setDB('historic_alarm')
ei.setCollection('raw_event')


ei2 = EventInterface()
ei2.setDB('historic_alarm')
ei2.setCollection('labeled_event')


#fid = open('final_labels.txt', 'r')
#
#for line in fid:
#	vals = line.split()
#	label = -1
#	if len(vals) > 1 and vals[1] == '1':
#		label = 1
#	event = ei.getDocument({'_id':ObjectId(vals[0])})
#	event['label'] = label
#	ei2.updateDocument(event)

events = ei2.getAllDocuments()
for event in events:
	label = event['label']
	event = EventFeature(event)
	(lat, lng) = event._getPhotoAvgLocation()
示例#24
0
from event_interface import EventInterface
from bson.objectid import ObjectId
from event_feature import EventFeature

# Print "<lat> <lng> <label>" for every labeled historic-alarm event,
# using the average location of its photos (label import kept commented out).
ei = EventInterface()
ei.setDB('historic_alarm')
ei.setCollection('raw_event')

ei2 = EventInterface()
ei2.setDB('historic_alarm')
ei2.setCollection('labeled_event')

#fid = open('final_labels.txt', 'r')
#
#for line in fid:
#	vals = line.split()
#	label = -1
#	if len(vals) > 1 and vals[1] == '1':
#		label = 1
#	event = ei.getDocument({'_id':ObjectId(vals[0])})
#	event['label'] = label
#	ei2.updateDocument(event)

events = ei2.getAllDocuments()
for event in events:
    label = event['label']
    event = EventFeature(event)
    (lat, lng) = event._getPhotoAvgLocation()
    print lat, lng, label
示例#25
0
from datetime import datetime

from event_interface import EventInterface


def getDate(utc_time):
    return repr(datetime.fromtimestamp(int(utc_time)))


# Promote next-week candidate events that pass the volume/z-score filter
# into the merged collection, in chronological order.
ei = EventInterface()
ei.setDB('citybeat')
ei.setCollection('next_week_candidate_event_25by25')

ei2 = EventInterface()
ei2.setDB('citybeat')
ei2.setCollection('next_week_candidate_event_25by25_merged')

events = ei.getAllDocuments().sort('created_time', 1)
for event in events:
    if event['actual_value'] >= 8 and event['zscore'] >= 3.0:
        ei2.addEvent(event)



        #region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
        #utc_time = str(1354728300)

        #region = {'min_lat': 40.730320599999999, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.743583800000003}
        #utc_time = str(1354340400)
        #
        #condition = ({'region.min_lat':region['min_lat'],
示例#26
0
			k = min(len(photos), k)
			# discard the keywords with only one photo
#			if k == 1:
#				break
			res.append([word, fre, photos[0:k]])
		return res
	
	def getTopKeywordsAndPhotos(self, num_keywords, num_photos):
		"""Return [word, freq, photos] rows for the top keywords (stopwords removed)."""
		keywords = self._getTopKeywordsWithoutStopwords(num_keywords)
		return self._getRandomPhotosAssociatedWithKeywords(keywords, num_photos)
	
	def getTopKeywordsAndPhotosByTFIDF(self, num_keywords, num_photos):
		"""Like getTopKeywordsAndPhotos, but rank candidate keywords by corpus TF-IDF."""
		# over-fetch candidates, then let the corpus pick the highest-TFIDF ones
		keywords = self._getTopKeywordsWithoutStopwords(100000)
		keywords = self._corpus.chooseTopWordWithHighestTDIDF(keywords, num_keywords)
		return self._getRandomPhotosAssociatedWithKeywords(keywords, num_photos)
			
if __name__=='__main__':
	
	# Build a corpus from the 10x10 merged events, then print the top
	# TF-IDF keywords for every event in that collection.
	collection = 'candidate_event_10by10_merged'
	
	c = Corpus()
	c.buildCorpusOnDB('citybeat', collection)
	
	ei = EventInterface()
	ei.setDB('citybeat')
	ei.setCollection(collection)
	events = ei.getAllDocuments()
	for event in events:
		event = EventFrontend(event, c)
		print event.getTopKeywordsAndPhotosByTFIDF(10,0)
示例#27
0
from event_interface import EventInterface

# Copy every merged 25x25 candidate event from the citybeat DB into AmazonMT.
source = EventInterface()
source.setDB('citybeat')
source.setCollection('candidate_event_25by25_merged')

target = EventInterface()
target.setDB('AmazonMT')
target.setCollection('candidate_event_25by25_merged')

for doc in source.getAllDocuments():
    target.saveDocument(doc)
示例#28
0
from event_interface import EventInterface


# Copy every merged 25x25 candidate event from the citybeat DB into AmazonMT.
source = EventInterface()
source.setDB('citybeat')
source.setCollection('candidate_event_25by25_merged')

target = EventInterface()
target.setDB('AmazonMT')
target.setCollection('candidate_event_25by25_merged')

for doc in source.getAllDocuments():
    target.saveDocument(doc)
示例#29
0
def insertEvents():
    """Copy four hand-picked merged events into the online_candidate collection."""
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('candidate_event_25by25_merged')

    target = EventInterface()
    target.setDB('citybeat')
    target.setCollection('online_candidate')

    picked_ids = (
        '51148288c2a3754cfe668edd',
        '51147952c2a3754cfe6684ee',
        '51148a7ec2a3754cfe669977',
        '51147967c2a3754cfe668503',
    )

    for event_id in picked_ids:
        doc = source.getDocument({'_id': ObjectId(event_id)})
        target.addEvent(doc)
示例#30
0
from event_interface import EventInterface

from prediction_interface import PredictionInterface

from region import Region

from bson.objectid import ObjectId
from event import Event

from datetime import datetime

def getDate(utc_time):
	return repr(datetime.fromtimestamp(int(utc_time)))

# Promote next-week candidate events that pass the volume/z-score filter
# into the merged collection, in chronological order.
source = EventInterface()
source.setDB('citybeat')
source.setCollection('next_week_candidate_event_25by25')

merged = EventInterface()
merged.setDB('citybeat')
merged.setCollection('next_week_candidate_event_25by25_merged')

for doc in source.getAllDocuments().sort('created_time', 1):
    if doc['actual_value'] >= 8 and doc['zscore'] >= 3.0:
        merged.addEvent(doc)



#region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
#utc_time = str(1354728300)
from event_interface import EventInterface
from event_feature import EventFeature
from photo_interface import PhotoInterface
from photo import Photo
from region import Region
from event import Event
from caption_parser import CaptionParser
from stopwords import Stopwords

import operator
import string
import types
import random
import math

ei = EventInterface()
ei.setDB('AmazonMT')
ei.setCollection('candidate_event_25by25_merged')

events = ei.getAllDocuments()

duplicates = 0
for event in events:
	e = Event(event)
	flag = e.removeDuplicatePhotos()
	if flag > 0:
		print e.getPhotoNumber(), e.getActualValue()
		ei.updateDocument(e)
示例#32
0
class Representor():
    def __init__(self, vectorizer = None, db='AmazonMT', collection='candidate_event_25by25_merged'):
        """Build a TF-IDF model over all photo captions of every event in db.collection.

        Events are loaded once, reduced to one photo per user, and their captions
        are used to fit the vectorizer, so representative photos can be chosen later.

        A custom TfidfVectorizer may be passed to override the default parameters.
        See http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
        """
        
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for e in self.ei.getAllDocuments():
            event = Event(e)
            # keep at most one photo per user so prolific users do not dominate the corpus
            event.selectOnePhotoForOneUser()
            e = event.toJSON()
            self.events.append(e)
        #self.events = [e for e in self.ei.getAllDocuments()]
        self._captions = self._getAllCaptions()
        
        if vectorizer is None:
            # char 4-grams bounded at word edges; low max_df prunes boilerplate terms
            self.vectorizer = TfidfVectorizer( max_df=0.05, min_df = 1, strip_accents='ascii', smooth_idf=True, preprocessor = self._preProcessor, sublinear_tf=True, norm = 'l2', analyzer='char_wb', ngram_range=(4,4), stop_words = 'english')
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)
#        print self.vectorizer.get_feature_names()
    def _preProcessor(self, caption):
        """Blank out captions with 5+ hashtags (likely spam) before vectorizing."""
        regex = re.compile(r"#\w+")
        match = regex.findall(caption)
        if len(match)>=5:
            return ""
        else:
            return caption

    def _getAllCaptions(self):
        """Return one flat list of the captions of every photo of every event."""
        _captions = []
        for event in self.events:
            _captions += self._getEventCaptions(event)
        return _captions

    def _is_ascii(self, _str):
        """True when every character of _str is 7-bit ASCII."""
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """For a given event, return the captions as a list. Photos without a
        caption (or with non-ASCII text) contribute an empty string so the
        list stays aligned with event['photos']."""
        event_captions = []
        for p in event['photos']:
            try:
                if self._is_ascii(p['caption']['text']):
                    event_captions.append( p['caption']['text'].lower() )
                else:
                    event_captions.append("")
            # NOTE(review): bare except also hides photos missing the caption key
            except:
                event_captions.append( "" )
        return event_captions 
    def _cosine_sim(self, a, b):
        """Dot products of row a against the rows of b (vectors are l2-normalized)."""
        return a*b.T
    
    def getRepresentivePhotos(self, event):
        """Return the event's photos sorted most-representative first
        (closest to the caption tf-idf centroid)."""
        event_captions = self._getEventCaptions(event)
        event_tfidf = self.vectorizer.transform(event_captions)
        
        centroid = event_tfidf.mean(axis=0)
        #cosine_similarities = linear_kernel(centroid, event_tfidf).flatten()
        cosine_similarities = np.asarray(self._cosine_sim(centroid, event_tfidf)).flatten()

        # argsort is ascending, so reverse after collecting to get best first
        most_related_pics = cosine_similarities.argsort()
        photos_to_return = []
        #print event['_id']
        for idx in most_related_pics:
#            print cosine_similarities[idx], event['photos'][idx]['link']
            photos_to_return.append( event['photos'][idx] )
        photos_to_return.reverse() 

        return photos_to_return 

    def getTfidfVector(self, event):
        """Return (feature indices, words, tf-idf values) for the event's mean caption vector."""
        voc = self.vectorizer.get_feature_names()
        tf_vec = self.vectorizer.transform(self._getEventCaptions(event)).mean(axis=0)

        nonzeros = np.nonzero(tf_vec)[1]
        res_list = nonzeros.ravel().tolist()[0] 

        values = []
        words = []
        for n in res_list:
            words.append( voc[n] )
            values.append( tf_vec[0,n] )

        return res_list, words, values

    def getCorpusWordsVector(self):
        """Return the fitted vectorizer's full vocabulary."""
        return self.vectorizer.get_feature_names()
示例#33
0
def mergeBaselineEvents():
    """Push every baseline candidate event into the merged baseline collection."""
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('baseline_candidate_events')

    merged = EventInterface()
    merged.setDB('citybeat')
    merged.setCollection('baseline_candidate_events_merged')

    for doc in source.getAllDocuments():
        merged.addEvent(doc)
示例#34
0
from event_interface import EventInterface

from prediction_interface import PredictionInterface

from region import Region

from bson.objectid import ObjectId
from event import Event

from datetime import datetime
import random

# Shuffle all 10x10 merged candidate events and save each into the label DB.
n = 300
source = EventInterface()
source.setCollection('candidate_event_10by10_merged')

event_list = list(source.getAllDocuments())

random.shuffle(event_list)


target = EventInterface()
target.setDB('label')
target.setCollection('label_10by10')

i = 0
for doc in event_list:
    target.saveDocument(doc)
示例#35
0
def testWithMerge():
    """Push every 25x25 candidate event through addEvent on a test collection."""
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('candidate_event_25by25')

    target = EventInterface()
    target.setDB('test')
    target.setCollection('candidate_event')

    for doc in source.getAllDocuments():
        target.addEvent(doc)
示例#36
0
            res.append([word, fre, photos[0:k]])
        return res

    def getTopKeywordsAndPhotos(self, num_keywords, num_photos):
        """Return [word, freq, photos] rows for the top keywords (stopwords removed)."""
        keywords = self._getTopKeywordsWithoutStopwords(num_keywords)
        return self._getRandomPhotosAssociatedWithKeywords(
            keywords, num_photos)

    def getTopKeywordsAndPhotosByTFIDF(self, num_keywords, num_photos):
        """Like getTopKeywordsAndPhotos, but rank candidate keywords by corpus TF-IDF."""
        # over-fetch candidates, then let the corpus pick the highest-TFIDF ones
        keywords = self._getTopKeywordsWithoutStopwords(100000)
        keywords = self._corpus.chooseTopWordWithHighestTDIDF(
            keywords, num_keywords)
        return self._getRandomPhotosAssociatedWithKeywords(
            keywords, num_photos)


if __name__ == '__main__':

    # Build a corpus from the 10x10 merged events, then print the top
    # TF-IDF keywords for every event in that collection.
    collection = 'candidate_event_10by10_merged'

    c = Corpus()
    c.buildCorpusOnDB('citybeat', collection)

    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection(collection)
    events = ei.getAllDocuments()
    for event in events:
        event = EventFrontend(event, c)
        print event.getTopKeywordsAndPhotosByTFIDF(10, 0)
示例#37
0
def insertEvents():
    """Copy four hand-picked merged events into the online_candidate collection."""
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('candidate_event_25by25_merged')

    target = EventInterface()
    target.setDB('citybeat')
    target.setCollection('online_candidate')

    picked_ids = (
        '51148288c2a3754cfe668edd',
        '51147952c2a3754cfe6684ee',
        '51148a7ec2a3754cfe669977',
        '51147967c2a3754cfe668503',
    )

    for event_id in picked_ids:
        doc = source.getDocument({'_id': ObjectId(event_id)})
        target.addEvent(doc)
示例#38
0
from event_interface import EventInterface

from prediction_interface import PredictionInterface

from region import Region

from bson.objectid import ObjectId
from event import Event

from datetime import datetime
import random

# Export merged candidate events, in random order, into the labeling DB.
n = 300  # NOTE(review): unused in this script — looks like a vestigial sample-size limit; confirm before deleting.
ei = EventInterface()
ei.setCollection("candidate_event_10by10_merged")
events = ei.getAllDocuments()

# Materialize the cursor so it can be shuffled in place.
event_list = list(events)

random.shuffle(event_list)


ei2 = EventInterface()
ei2.setDB("label")
ei2.setCollection("label_10by10")

i = 0  # NOTE(review): never incremented or read below — confirm before deleting.
for event in event_list:
    ei2.saveDocument(event)
示例#39
0
from event_interface import EventInterface
from event_feature import EventFeature
from photo_interface import PhotoInterface
from photo import Photo
from region import Region
from event import Event
from caption_parser import CaptionParser
from stopwords import Stopwords

import operator
import string
import types
import random
import math

# Remove duplicate photos from every merged candidate event in place.
ei = EventInterface()
ei.setDB('AmazonMT')
ei.setCollection('candidate_event_25by25_merged')

events = ei.getAllDocuments()

# NOTE(review): 'duplicates' is never updated in this visible scope —
# presumably a leftover counter; confirm before deleting.
duplicates = 0
for event in events:
    e = Event(event)
    # removeDuplicatePhotos returns a truthy/positive flag when something
    # was removed; only then is the document written back.
    flag = e.removeDuplicatePhotos()
    if flag > 0:
        print e.getPhotoNumber(), e.getActualValue()
        ei.updateDocument(e)
示例#40
0
class Representor:
    """Rank an event's photos by how representative their captions are.

    A TF-IDF model over character 4-grams is fit on every caption in the
    given collection; an event's photos are then ranked by cosine
    similarity between each caption vector and the event's caption centroid.
    """

    def __init__(self, vectorizer=None, db="AmazonMT", collection="candidate_event_25by25_merged"):
        """Load all events from db/collection and fit the TF-IDF model.

        A custom vectorizer may be supplied to override the default
        TfidfVectorizer parameters; see
        http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
        """
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for e in self.ei.getAllDocuments():
            event = Event(e)
            # Keep at most one photo per user so prolific users do not
            # dominate the caption corpus.
            event.selectOnePhotoForOneUser()
            self.events.append(event.toJSON())
        self._captions = self._getAllCaptions()

        if vectorizer is None:
            # Character 4-grams restricted to word boundaries; aggressive
            # max_df drops near-ubiquitous fragments from the vocabulary.
            self.vectorizer = TfidfVectorizer(
                max_df=0.05,
                min_df=1,
                strip_accents="ascii",
                smooth_idf=True,
                preprocessor=self._preProcessor,
                sublinear_tf=True,
                norm="l2",
                analyzer="char_wb",
                ngram_range=(4, 4),
                stop_words="english",
            )
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)

    def _preProcessor(self, caption):
        """Blank out spammy captions: five or more hashtags -> empty string."""
        regex = re.compile(r"#\w+")
        match = regex.findall(caption)
        if len(match) >= 5:
            return ""
        else:
            return caption

    def _getAllCaptions(self):
        """Return the captions of every photo of every loaded event."""
        _captions = []
        for event in self.events:
            _captions += self._getEventCaptions(event)
        return _captions

    def _is_ascii(self, _str):
        """True when every character of _str is 7-bit ASCII."""
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """For a given event, return the captions as a list. Photos with a
        missing, None, or non-ASCII caption contribute an empty-string
        placeholder so indices stay aligned with event['photos']."""
        event_captions = []
        for p in event["photos"]:
            try:
                if self._is_ascii(p["caption"]["text"]):
                    event_captions.append(p["caption"]["text"].lower())
                else:
                    event_captions.append("")
            # KeyError: no 'caption'/'text'; TypeError: caption is None.
            # (Was a bare except; narrowed so real bugs are not swallowed.)
            except (KeyError, TypeError):
                event_captions.append("")
        return event_captions

    def _cosine_sim(self, a, b):
        """Dot products of centroid a against each row of b. Rows are
        l2-normalized by the vectorizer, so this equals cosine similarity."""
        return a * b.T

    def getRepresentivePhotos(self, event):
        """Return event['photos'] sorted most-representative caption first."""
        event_captions = self._getEventCaptions(event)
        event_tfidf = self.vectorizer.transform(event_captions)

        centroid = event_tfidf.mean(axis=0)
        cosine_similarities = np.asarray(self._cosine_sim(centroid, event_tfidf)).flatten()

        # argsort is ascending; reverse at the end for best-first order.
        most_related_pics = cosine_similarities.argsort()
        photos_to_return = [event["photos"][idx] for idx in most_related_pics]
        photos_to_return.reverse()

        return photos_to_return

    def getTfidfVector(self, event):
        """Return (indices, words, values) for the nonzero entries of the
        event's mean TF-IDF caption vector."""
        voc = self.vectorizer.get_feature_names()
        tf_vec = self.vectorizer.transform(self._getEventCaptions(event)).mean(axis=0)

        nonzeros = np.nonzero(tf_vec)[1]
        res_list = nonzeros.ravel().tolist()[0]

        values = []
        words = []
        for n in res_list:
            words.append(voc[n])
            values.append(tf_vec[0, n])

        return res_list, words, values

    def getCorpusWordsVector(self):
        """Return the fitted vocabulary (feature names) of the vectorizer."""
        return self.vectorizer.get_feature_names()
示例#41
0
from prediction_interface import PredictionInterface

from region import Region

from bson.objectid import ObjectId
from event import Event

from datetime import datetime


def getDate(utc_time):
    """Render a UTC epoch timestamp (string or number) as the repr of the
    corresponding local datetime."""
    timestamp = int(utc_time)
    return repr(datetime.fromtimestamp(timestamp))


# Source: unmerged next-week candidate events.
ei = EventInterface()
ei.setDB("citybeat")
ei.setCollection("next_week_candidate_event_25by25")

# Destination: the merged collection the front end reads from.
ei2 = EventInterface()
ei2.setDB("citybeat")
ei2.setCollection("next_week_candidate_event_25by25_merged")

# Iterate oldest-first; promote only events that pass both thresholds.
# NOTE(review): assumes 'actual_value' is the event's photo count and
# 'zscore' its anomaly score — confirm against the event schema.
events = ei.getAllDocuments().sort("created_time", 1)
for event in events:
    if event["actual_value"] >= 8 and event["zscore"] >= 3.0:
        ei2.addEvent(event)


# region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
# utc_time = str(1354728300)