예제 #1
0
def test():
    """Print the ID and photo image URLs of a couple of front-end events.

    Reads the documents returned by ``getAllDocuments(limit=2)`` from the
    ``instagram_front_end_events`` collection and wraps each in a
    ``PhotoEvent`` before printing.
    """
    interface = EventInterface(collection='instagram_front_end_events')
    for document in interface.getAllDocuments(limit=2):
        photo_event = PhotoEvent(document)
        print(photo_event.getID())
        print(photo_event.getAllPhotoImageUrls())
예제 #2
0
def generateData(biased=True):
    """Print the feature vectors of all labeled events as ARFF data rows.

    Events are read from the ``labeled_event`` collection of the
    ``historic_alarm`` database.  Rows whose last component equals 1 (the
    "true" events) are printed first, followed by the remaining rows in
    random order.

    Args:
        biased: When True (default) every false event is printed.  When
            False, at most as many false events as true events are printed,
            so the two groups are balanced.  Unlike the earlier loop-and-break
            version, this also holds when there are no true events at all
            (no false events are printed then).
    """
    ei = EventInterface()
    ei.setDB("historic_alarm")
    ei.setCollection("labeled_event")

    # Return value is discarded; presumably this prints the header itself.
    EventFeature.GenerateArffFileHeader()

    true_events = []
    false_events = []
    for event in ei.getAllDocuments():
        feature_vector = EventFeature(event).extractFeatures(3)
        # The last component decides which group the row belongs to.
        if feature_vector[-1] == 1:
            true_events.append(feature_vector)
        else:
            false_events.append(feature_vector)

    random.shuffle(false_events)
    if not biased:
        # Keep a random sample of false events no larger than the true set.
        false_events = false_events[:len(true_events)]

    for fv in true_events + false_events:
        # Same layout as before: components separated by " , ".
        print(" , ".join(str(value) for value in fv))
예제 #3
0
def generateData(biased=True):
    """Dump labeled events from ``historic_alarm.labeled_event`` as ARFF rows.

    Every event is converted to a feature vector.  Vectors whose last
    component is 1 are treated as true events and printed first; all other
    vectors are shuffled and printed afterwards.

    Args:
        biased: If True (default), print all false events.  If False, print
            only as many false events as there are true events.  An empty
            true set therefore yields no false rows, where the previous
            counter-based loop printed all of them.
    """
    ei = EventInterface()
    ei.setDB('historic_alarm')
    ei.setCollection('labeled_event')
    events = ei.getAllDocuments()

    # Result is not used here; presumably the header is printed as a side effect.
    EventFeature.GenerateArffFileHeader()

    true_events = []
    false_events = []
    for event in events:
        feature_vector = EventFeature(event).extractFeatures(3)
        is_true_event = feature_vector[-1] == 1
        if is_true_event:
            true_events.append(feature_vector)
        else:
            false_events.append(feature_vector)

    random.shuffle(false_events)

    # Balanced mode caps the false rows at the number of true rows.
    limit = len(false_events) if biased else len(true_events)

    for fv in true_events:
        print(' , '.join(map(str, fv)))
    for fv in false_events[:limit]:
        print(' , '.join(map(str, fv)))
예제 #4
0
	def buildCorpusOnDB(self, db, collection):
		"""Feed the word list of every document in a collection to ``_addDocument``.

		``db`` and ``collection`` are passed to ``EventInterface.setDB`` and
		``EventInterface.setCollection`` respectively.
		"""
		interface = EventInterface()
		interface.setDB(db)
		interface.setCollection(collection)
		for document in interface.getAllDocuments():
			self._addDocument(self.getWordList(document))
예제 #5
0
def main():
    """Count labeled events lying in the same grid region as a reference event.

    Steps:
      1. Load the reference event and compute the centre of its region.
      2. Find the region in ``region_cache/25_25.txt`` containing that centre.
      3. Load the ``event_id,label`` pairs from
         ``labeled_data_cf/181_positive.txt``.
      4. For every labeled event whose region equals the cached region
         (compared with ``floatEqual``), count it; print the ID of each
         event labeled 1, then the positive count and the total count.

    Raises:
        ValueError: if no cached region contains the reference centre.
            Previously this surfaced later as a NameError on ``min_lat``.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    events = ei.getAllDocuments()

    event = ei.getEventByID('511478c8c2a3754cfe6684a9')
    print(event['region'])

    # Centre of the reference event's bounding box.
    lat = (event['region']['min_lat'] + event['region']['max_lat']) / 2
    lon = (event['region']['min_lng'] + event['region']['max_lng']) / 2

    # Each cache line is read as "min_lat min_lng max_lat max_lng".
    bounds = None
    with open('region_cache/25_25.txt', 'r') as fid1:
        for line in fid1:
            cor = [float(value) for value in line.split(' ')]
            if cor[0] <= lat <= cor[2] and cor[1] <= lon <= cor[3]:
                bounds = cor
                break
    if bounds is None:
        raise ValueError(
            'no region in region_cache/25_25.txt contains (%s, %s)'
            % (lat, lon))

    min_lat, min_lng, max_lat, max_lng = bounds[:4]
    print('%s %s %s %s' % (min_lat, max_lat, min_lng, max_lng))

    labels = {}
    with open('labeled_data_cf/181_positive.txt', 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            labels[str(t[0])] = int(t[1])

    pos = 0
    tot = 0
    for event in events:
        region = event['region']
        event_id = str(event['_id'])
        # Direct dict membership; .keys() builds a list on Python 2.
        if event_id not in labels:
            continue

        if (floatEqual(region['min_lat'], min_lat)
                and floatEqual(region['max_lat'], max_lat)
                and floatEqual(region['min_lng'], min_lng)
                and floatEqual(region['max_lng'], max_lng)):
            tot += 1
            if labels[event_id] == 1:
                pos += 1
                print(event_id)
    print(pos)
    print(tot)
예제 #6
0
def testWithMerge():
    """Add every event of ``citybeat.candidate_event_25by25`` to ``test.candidate_event``.

    Each source document is handed to ``addEvent`` on the target interface.
    """
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('candidate_event_25by25')

    target = EventInterface()
    target.setDB('test')
    target.setCollection('candidate_event')

    for document in source.getAllDocuments():
        target.addEvent(document)
예제 #7
0
def mergeBaselineEvents():
    """Add all baseline candidate events to the ``..._merged`` collection.

    Reads ``citybeat.baseline_candidate_events`` and passes each document to
    ``addEvent`` on ``citybeat.baseline_candidate_events_merged``.
    """
    source = EventInterface()
    source.setDB('citybeat')
    source.setCollection('baseline_candidate_events')

    merged = EventInterface()
    merged.setDB('citybeat')
    merged.setCollection('baseline_candidate_events_merged')

    for document in source.getAllDocuments():
        merged.addEvent(document)
예제 #8
0
def main():
    """Report how many labeled events share the reference event's grid region.

    The reference event is looked up by a fixed ID, and the centre of its
    region is matched against the regions cached in
    ``region_cache/25_25.txt``.  Labels come from
    ``labeled_data_cf/181_positive.txt`` (``event_id,label`` per line).
    Unlabeled events are ignored.  For labeled events in the matched region
    (per ``floatEqual``) the function prints the IDs labeled 1, then the
    number of positives and the number of labeled events in the region.

    Raises:
        ValueError: if the cache has no region containing the reference
            centre (this used to fail later with a NameError).
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    events = ei.getAllDocuments()

    event = ei.getEventByID('511478c8c2a3754cfe6684a9')
    print(event['region'])

    reference = event['region']
    lat = (reference['min_lat'] + reference['max_lat']) / 2
    lon = (reference['min_lng'] + reference['max_lng']) / 2

    # Cache columns are used as: min_lat, min_lng, max_lat, max_lng.
    matched = None
    with open('region_cache/25_25.txt', 'r') as fid1:
        for line in fid1:
            cor = [float(token) for token in line.split(' ')]
            inside_lat = cor[0] <= lat <= cor[2]
            inside_lng = cor[1] <= lon <= cor[3]
            if inside_lat and inside_lng:
                matched = cor
                break
    if matched is None:
        raise ValueError(
            'no region in region_cache/25_25.txt contains (%s, %s)'
            % (lat, lon))

    min_lat = matched[0]
    max_lat = matched[2]
    min_lng = matched[1]
    max_lng = matched[3]
    print('%s %s %s %s' % (min_lat, max_lat, min_lng, max_lng))

    labels = {}
    with open('labeled_data_cf/181_positive.txt', 'r') as fid2:
        for line in fid2:
            t = line.split(',')
            labels[str(t[0])] = int(t[1])

    pos = 0
    tot = 0
    for event in events:
        region = event['region']
        event_id = str(event['_id'])
        if event_id not in labels:
            continue

        same_region = (
            floatEqual(region['min_lat'], min_lat)
            and floatEqual(region['max_lat'], max_lat)
            and floatEqual(region['min_lng'], min_lng)
            and floatEqual(region['max_lng'], max_lng))
        if not same_region:
            continue

        tot += 1
        if labels[event_id] == 1:
            pos += 1
            print(event_id)
    print(pos)
    print(tot)
예제 #9
0
def testWithTweet():
    """Print feature vectors for Twitter candidate events with value >= 8.

    Prints the ARFF header, then for every event in
    ``citybeat_experiment.twitter_candidate_events`` either its extracted
    features or the marker ``< 8`` when ``getActualValue()`` is below 8.
    The last line shows the number of events kept and the cursor's count.
    """
    kept = 0
    corpus_all = buildAllCorpus(element_type='tweets', debug=False)

    interface = EventInterface()
    interface.setDB('citybeat_experiment')
    interface.setCollection('twitter_candidate_events')
    cur = interface.getAllDocuments()

    print(TwitterFeature.GenerateArffFileHeader())
    for document in cur:
        # Each region has its own corpus, looked up by the region's key.
        region_key = Region(document['region']).getKey()
        feature = TwitterFeature(document, corpus=corpus_all[region_key])
        if feature.getActualValue() < 8:
            print('< 8')
            continue
        kept += 1
        print(feature.extractFeatures())
    print('%s %s' % (kept, cur.count()))
예제 #10
0
def getBaselineEvents(sample_size=50):
    """Print the IDs of a random sample of significant baseline events.

    An event from ``citybeat.baseline_candidate_events`` qualifies when its
    actual value is at least 8 and its z-score is at least 3.

    Args:
        sample_size: Maximum number of IDs to print (default 50, the value
            that used to be hard-coded).  If fewer events qualify, all of
            them are printed; the previous index loop raised IndexError in
            that case.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('baseline_candidate_events')

    event_list = []
    for event in ei.getAllDocuments():
        e = Event(event)
        if e.getActualValue() < 8 or e.getZscore() < 3:
            continue
        event_list.append(event)

    random.shuffle(event_list)

    # Slicing never runs past the end of the list.
    for event in event_list[:sample_size]:
        print(event['_id'])
예제 #11
0
from event_interface import EventInterface


def getDate(utc_time):
    """Return the ``repr`` of the local datetime for an epoch timestamp.

    ``utc_time`` may be anything ``int()`` accepts (number or numeric string).
    """
    seconds = int(utc_time)
    moment = datetime.fromtimestamp(seconds)
    return repr(moment)


# Source collection: the unmerged candidate events.
ei = EventInterface()
ei.setDB('citybeat')
ei.setCollection('next_week_candidate_event_25by25')

# Target collection that receives the qualifying events.
ei2 = EventInterface()
ei2.setDB('citybeat')
ei2.setCollection('next_week_candidate_event_25by25_merged')

# Walk the events in ascending created_time order and add only those
# with actual_value >= 8 and zscore >= 3.0.
events = ei.getAllDocuments().sort('created_time', 1)
for event in events:
    if not (event['actual_value'] >= 8 and event['zscore'] >= 3.0):
        continue
    ei2.addEvent(event)



        #region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
        #utc_time = str(1354728300)

        #region = {'min_lat': 40.730320599999999, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.743583800000003}
        #utc_time = str(1354340400)
        #
        #condition = ({'region.min_lat':region['min_lat'],
        #                 'region.min_lng':region['min_lng'],
        #                 'region.max_lat':region['max_lat'],
예제 #12
0
class Representor():
    """Rank the photos of an event by how representative their captions are.

    On construction every event of the given collection is loaded, passed
    through ``Event.selectOnePhotoForOneUser`` and stored as JSON; the
    captions of all stored events are used to fit a TF-IDF vectorizer.
    ``getRepresentivePhotos`` then orders the photos of one event by the
    similarity of each caption to the event's mean caption vector.
    """

    # Hashtag pattern used by _preProcessor; compiled once for all captions.
    _HASHTAG_PATTERN = re.compile(r"#\w+")
    # Captions with at least this many hashtags are blanked out.
    _MAX_HASHTAGS = 5

    def __init__(self, vectorizer = None, db='AmazonMT', collection='candidate_event_25by25_merged'):
        """Load the events and fit the TF-IDF model on their captions.

        Args:
            vectorizer: Optional vectorizer to use instead of the default
                ``TfidfVectorizer`` configuration, so TF-IDF parameters can
                be customised.  See
                http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
            db: Database name passed to ``EventInterface.setDB``.
            collection: Collection name passed to
                ``EventInterface.setCollection``.
        """
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for e in self.ei.getAllDocuments():
            event = Event(e)
            event.selectOnePhotoForOneUser()
            self.events.append(event.toJSON())
        self._captions = self._getAllCaptions()

        if vectorizer is None:
            self.vectorizer = TfidfVectorizer( max_df=0.05, min_df = 1, strip_accents='ascii', smooth_idf=True, preprocessor = self._preProcessor, sublinear_tf=True, norm = 'l2', analyzer='char_wb', ngram_range=(4,4), stop_words = 'english')
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)

    def _preProcessor(self, caption):
        """Return ``caption`` unchanged, or "" if it has five or more hashtags."""
        hashtags = self._HASHTAG_PATTERN.findall(caption)
        if len(hashtags) >= self._MAX_HASHTAGS:
            return ""
        return caption

    def _getAllCaptions(self):
        """Return the captions of all loaded events as one flat list."""
        _captions = []
        for event in self.events:
            _captions.extend(self._getEventCaptions(event))
        return _captions

    def _is_ascii(self, _str):
        """Return True if every character of ``_str`` has a code point below 128."""
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """Return one lower-cased caption per photo of ``event``.

        The result is aligned with ``event['photos']``: a photo whose
        caption is missing, malformed or not pure ASCII contributes an
        empty string as a placeholder.
        """
        event_captions = []
        for p in event['photos']:
            try:
                text = p['caption']['text']
                caption = text.lower() if self._is_ascii(text) else ""
            except (KeyError, TypeError, AttributeError):
                # No caption, caption is None, or the text is not a string.
                caption = ""
            event_captions.append(caption)
        return event_captions

    def _cosine_sim(self, a, b):
        """Return the dot products of ``a`` with the rows of ``b``.

        NOTE(review): this equals cosine similarity only up to the norm of
        the inputs; the ordering it induces is what callers rely on.
        """
        return a*b.T

    def getRepresentivePhotos(self, event):
        """Return the photos of ``event``, most representative first.

        Photos are ordered by decreasing similarity between their caption
        vector and the mean caption vector of the event.  An event without
        photos yields an empty list.
        """
        event_captions = self._getEventCaptions(event)
        if not event_captions:
            # Nothing to rank; avoids transforming an empty document list.
            return []
        event_tfidf = self.vectorizer.transform(event_captions)

        centroid = event_tfidf.mean(axis=0)
        cosine_similarities = np.asarray(self._cosine_sim(centroid, event_tfidf)).flatten()

        # argsort is ascending, so walk it backwards for best-first order.
        most_related_pics = cosine_similarities.argsort()
        return [event['photos'][idx] for idx in most_related_pics[::-1]]

    def getTfidfVector(self, event):
        """Return the non-zero entries of the event's mean TF-IDF vector.

        Returns:
            A tuple ``(indices, words, values)`` of equally long lists: the
            vocabulary indices with a non-zero mean weight, the matching
            vocabulary entries, and the weights themselves.
        """
        voc = self.vectorizer.get_feature_names()
        tf_vec = self.vectorizer.transform(self._getEventCaptions(event)).mean(axis=0)

        # Column indices of the non-zero entries.  np.asarray flattens the
        # result whether numpy hands back a matrix or a plain array.
        nonzeros = np.nonzero(tf_vec)[1]
        res_list = np.asarray(nonzeros).ravel().tolist()

        words = [voc[n] for n in res_list]
        values = [tf_vec[0,n] for n in res_list]

        return res_list, words, values

    def getCorpusWordsVector(self):
        """Return the vocabulary (feature names) of the fitted vectorizer."""
        return self.vectorizer.get_feature_names()
예제 #13
0
class Representor:
    """Select the photos whose captions best represent an event.

    The constructor loads every event of a collection (after
    ``Event.selectOnePhotoForOneUser`` has been applied to it) and fits a
    TF-IDF vectorizer on all of their captions.  For a single event,
    ``getRepresentivePhotos`` returns its photos sorted by how similar each
    caption is to the event's average caption vector.
    """

    # Compiled once; _preProcessor runs for every caption the vectorizer sees.
    _HASHTAG_RE = re.compile(r"#\w+")
    # A caption with this many hashtags or more is replaced by "".
    _HASHTAG_LIMIT = 5

    def __init__(self, vectorizer=None, db="AmazonMT", collection="candidate_event_25by25_merged"):
        """Load events from ``db``/``collection`` and fit the TF-IDF model.

        Args:
            vectorizer: Vectorizer to use in place of the default
                ``TfidfVectorizer`` set-up; pass one to customise the TF-IDF
                parameters.  See
                http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
            db: Name handed to ``EventInterface.setDB``.
            collection: Name handed to ``EventInterface.setCollection``.
        """
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for e in self.ei.getAllDocuments():
            event = Event(e)
            event.selectOnePhotoForOneUser()
            self.events.append(event.toJSON())
        self._captions = self._getAllCaptions()

        if vectorizer is None:
            self.vectorizer = TfidfVectorizer(
                max_df=0.05,
                min_df=1,
                strip_accents="ascii",
                smooth_idf=True,
                preprocessor=self._preProcessor,
                sublinear_tf=True,
                norm="l2",
                analyzer="char_wb",
                ngram_range=(4, 4),
                stop_words="english",
            )
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)

    def _preProcessor(self, caption):
        """Blank out captions that contain five or more hashtags."""
        if len(self._HASHTAG_RE.findall(caption)) >= self._HASHTAG_LIMIT:
            return ""
        return caption

    def _getAllCaptions(self):
        """Concatenate the caption lists of all loaded events."""
        _captions = []
        for event in self.events:
            _captions += self._getEventCaptions(event)
        return _captions

    def _is_ascii(self, _str):
        """Tell whether ``_str`` consists only of code points below 128."""
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """Return the lower-cased captions of ``event``, one per photo.

        Positions match ``event["photos"]``.  A photo with no usable caption
        (missing, malformed or containing non-ASCII characters) is
        represented by an empty string.
        """
        event_captions = []
        for p in event["photos"]:
            try:
                text = p["caption"]["text"]
                if self._is_ascii(text):
                    event_captions.append(text.lower())
                else:
                    event_captions.append("")
            except (KeyError, TypeError, AttributeError):
                # Caption absent, None, or its text is not a string.
                event_captions.append("")
        return event_captions

    def _cosine_sim(self, a, b):
        """Multiply ``a`` by the transpose of ``b`` (row-wise dot products).

        NOTE(review): not normalised by the vector lengths, so it is a true
        cosine only for unit-length inputs; callers use it for ranking.
        """
        return a * b.T

    def getRepresentivePhotos(self, event):
        """Return ``event["photos"]`` ordered from most to least representative.

        Representativeness is the similarity between a photo's caption
        vector and the mean caption vector of the event.  Returns ``[]`` for
        an event without photos.
        """
        event_captions = self._getEventCaptions(event)
        if not event_captions:
            # No captions to vectorize, hence nothing to rank.
            return []

        event_tfidf = self.vectorizer.transform(event_captions)
        centroid = event_tfidf.mean(axis=0)
        cosine_similarities = np.asarray(self._cosine_sim(centroid, event_tfidf)).flatten()

        # Ascending argsort reversed gives the highest similarity first.
        ranking = cosine_similarities.argsort()[::-1]
        return [event["photos"][idx] for idx in ranking]

    def getTfidfVector(self, event):
        """Describe the event's mean TF-IDF vector by its non-zero entries.

        Returns:
            ``(indices, words, values)``: vocabulary indices with non-zero
            mean weight, the vocabulary entries at those indices, and the
            corresponding weights, as three parallel lists.
        """
        voc = self.vectorizer.get_feature_names()
        tf_vec = self.vectorizer.transform(self._getEventCaptions(event)).mean(axis=0)

        # Second element holds the column indices.  np.asarray makes the
        # flattening independent of whether numpy returns a matrix here.
        nonzeros = np.nonzero(tf_vec)[1]
        res_list = np.asarray(nonzeros).ravel().tolist()

        words = []
        values = []
        for n in res_list:
            words.append(voc[n])
            values.append(tf_vec[0, n])

        return res_list, words, values

    def getCorpusWordsVector(self):
        """Return the feature names learned by the fitted vectorizer."""
        return self.vectorizer.get_feature_names()
예제 #14
0
from event import Event

from datetime import datetime

def getDate(utc_time):
    """Convert an epoch timestamp to the ``repr`` of a local ``datetime``.

    The argument is truncated to an integer with ``int()`` first.
    """
    timestamp = int(utc_time)
    return repr(datetime.fromtimestamp(timestamp))

# Collection the candidate events are read from.
ei = EventInterface()
ei.setDB('citybeat')
ei.setCollection('next_week_candidate_event_25by25')

# Collection the qualifying events are added to.
ei2 = EventInterface()
ei2.setDB('citybeat')
ei2.setCollection('next_week_candidate_event_25by25_merged')

# Events are handled in ascending created_time order; only those with
# actual_value >= 8 and zscore >= 3.0 are passed to addEvent.
events = ei.getAllDocuments().sort('created_time', 1)
for event in events:
    qualifies = event['actual_value'] >= 8 and event['zscore'] >= 3.0
    if qualifies:
        ei2.addEvent(event)



#region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
#utc_time = str(1354728300)

#region = {'min_lat': 40.730320599999999, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.743583800000003}
#utc_time = str(1354340400)
#
#condition = ({'region.min_lat':region['min_lat'],
#		          'region.min_lng':region['min_lng'],
#		          'region.max_lat':region['max_lat'],
예제 #15
0
from event_interface import EventInterface
from bson.objectid import ObjectId
from event_feature import EventFeature

# Raw events; only referenced by the disabled labeling step below.
ei = EventInterface()
ei.setDB('historic_alarm')
ei.setCollection('raw_event')

# Labeled events; this is the collection actually read.
ei2 = EventInterface()
ei2.setDB('historic_alarm')
ei2.setCollection('labeled_event')

# Disabled one-off step that copied labels from final_labels.txt onto the
# raw events and stored them in the labeled collection.
#fid = open('final_labels.txt', 'r')
#
#for line in fid:
#	vals = line.split()
#	label = -1
#	if len(vals) > 1 and vals[1] == '1':
#		label = 1
#	event = ei.getDocument({'_id':ObjectId(vals[0])})
#	event['label'] = label
#	ei2.updateDocument(event)

# Print "<lat> <lng> <label>" for every labeled event, using the average
# photo location reported by EventFeature.
events = ei2.getAllDocuments()
for event in events:
    label = event['label']
    event = EventFeature(event)
    lat, lng = event._getPhotoAvgLocation()
    print('%s %s %s' % (lat, lng, label))
예제 #16
0
from bson.objectid import ObjectId
from event_feature import EventFeature


# Interface to the raw events (used only by the commented-out code below).
ei = EventInterface()
ei.setDB('historic_alarm')
ei.setCollection('raw_event')

# Interface to the labeled events that are printed at the end.
ei2 = EventInterface()
ei2.setDB('historic_alarm')
ei2.setCollection('labeled_event')

# Labeling step kept for reference; it is not executed.
#fid = open('final_labels.txt', 'r')
#
#for line in fid:
#	vals = line.split()
#	label = -1
#	if len(vals) > 1 and vals[1] == '1':
#		label = 1
#	event = ei.getDocument({'_id':ObjectId(vals[0])})
#	event['label'] = label
#	ei2.updateDocument(event)

# One output line per labeled event: average photo latitude, longitude
# and the stored label, separated by single spaces.
events = ei2.getAllDocuments()
for event in events:
    label = event['label']
    event = EventFeature(event)
    location = event._getPhotoAvgLocation()
    (lat, lng) = location
    print('%s %s %s' % (lat, lng, label))
예제 #17
0
파일: merge_db.py 프로젝트: oeddyo/CityBeat
from datetime import datetime


def getDate(utc_time):
    """Return ``repr(datetime)`` for the given epoch seconds, in local time.

    Accepts any value that ``int()`` can convert.
    """
    local_time = datetime.fromtimestamp(int(utc_time))
    return repr(local_time)


# Read side: unmerged candidate events.
ei = EventInterface()
ei.setDB("citybeat")
ei.setCollection("next_week_candidate_event_25by25")

# Write side: collection that receives the events passing the filter.
ei2 = EventInterface()
ei2.setDB("citybeat")
ei2.setCollection("next_week_candidate_event_25by25_merged")

# Oldest events first (ascending created_time).  An event is added only
# when actual_value >= 8 and zscore >= 3.0.
events = ei.getAllDocuments().sort("created_time", 1)
for event in events:
    if not (event["actual_value"] >= 8 and event["zscore"] >= 3.0):
        continue
    ei2.addEvent(event)


# region= {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
# utc_time = str(1354728300)

# region = {'min_lat': 40.730320599999999, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.743583800000003}
# utc_time = str(1354340400)
#
# condition = ({'region.min_lat':region['min_lat'],
# 		          'region.min_lng':region['min_lng'],
# 		          'region.max_lat':region['max_lat'],
# 		          'region.max_lng':region['max_lng']})