def test():
    ei = EventInterface(collection='instagram_front_end_events')
    cur = ei.getAllDocuments(limit=2)
    for e in cur:
        e = PhotoEvent(e)
        print e.getID()
        print e.getAllPhotoImageUrls()
import random

from event_feature import EventFeature
from event_interface import EventInterface


def generateData(biased=True):
    """Print the labeled events as an ARFF data set on stdout.

    With biased=False, only as many negative examples are emitted as
    there are positive ones, so the two classes come out balanced.
    """
    ei = EventInterface()
    ei.setDB('historic_alarm')
    ei.setCollection('labeled_event')
    events = ei.getAllDocuments()
    EventFeature.GenerateArffFileHeader()
    true_events = []
    false_events = []
    for event in events:
        event = EventFeature(event)
        feature_vector = event.extractFeatures(3)
        if feature_vector[-1] == 1:
            true_events.append(feature_vector)
        else:
            false_events.append(feature_vector)
    random.shuffle(false_events)
    for fv in true_events:
        for i in xrange(0, len(fv) - 1):
            print fv[i], ',',
        print fv[-1]
    j = 0
    for fv in false_events:
        for i in xrange(0, len(fv) - 1):
            print fv[i], ',',
        print fv[-1]
        j += 1
        if not biased and j == len(true_events):
            break
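# A minimal run sketch, assuming this function lives in its own script
# (the script name below is hypothetical). The ARFF header and rows go
# to stdout, so redirect them into a file:
#
#   python generate_training_data.py > labeled_events.arff
if __name__ == '__main__':
    generateData(biased=False)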
def buildCorpusOnDB(self, db, collection):
    """Add the word list of every event in db.collection to this corpus."""
    ei = EventInterface()
    ei.setDB(db)
    ei.setCollection(collection)
    events = ei.getAllDocuments()
    for event in events:
        word_list = self.getWordList(event)
        self._addDocument(word_list)
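# A hypothetical usage sketch: buildCorpusOnDB is a method, so it assumes
# a corpus class providing getWordList() and _addDocument(). The class
# name below is illustrative; the db/collection names appear elsewhere in
# this repo.
#
#   corpus = Corpus()
#   corpus.buildCorpusOnDB('citybeat', 'candidate_event_25by25_merged')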
def main():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')
    events = ei.getAllDocuments()
    # Find the 25x25 grid cell that contains the centroid of one
    # reference event.
    event = ei.getEventByID('511478c8c2a3754cfe6684a9')
    print event['region']
    lat = (event['region']['min_lat'] + event['region']['max_lat']) / 2
    lon = (event['region']['min_lng'] + event['region']['max_lng']) / 2
    fid1 = open('region_cache/25_25.txt', 'r')
    for line in fid1:
        cor = [float(c) for c in line.split(' ')]
        if cor[0] <= lat <= cor[2] and cor[1] <= lon <= cor[3]:
            min_lat, max_lat = cor[0], cor[2]
            min_lng, max_lng = cor[1], cor[3]
            print min_lat, max_lat, min_lng, max_lng
            break
    fid1.close()
    # Load the hand labels, then count how many labeled events fall in
    # that cell and how many of those are positive.
    fid2 = open('labeled_data_cf/181_positive.txt', 'r')
    labels = {}
    for line in fid2:
        t = line.split(',')
        labels[str(t[0])] = int(t[1])
    fid2.close()
    pos = 0
    tot = 0
    for event in events:
        region = event['region']
        id = str(event['_id'])
        if id not in labels:
            continue
        if (floatEqual(region['min_lat'], min_lat) and
                floatEqual(region['max_lat'], max_lat) and
                floatEqual(region['min_lng'], min_lng) and
                floatEqual(region['max_lng'], max_lng)):
            tot += 1
            if labels[id] == 1:
                pos += 1
            print id
    print pos
    print tot
def testWithMerge():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25')
    ei2 = EventInterface()
    ei2.setDB('test')
    ei2.setCollection('candidate_event')
    cur = ei.getAllDocuments()
    for event in cur:
        ei2.addEvent(event)
def mergeBaselineEvents():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('baseline_candidate_events')
    ei2 = EventInterface()
    ei2.setDB('citybeat')
    ei2.setCollection('baseline_candidate_events_merged')
    events = ei.getAllDocuments()
    for event in events:
        ei2.addEvent(event)
def testWithTweet():
    cnt = 0
    corpus_all = buildAllCorpus(element_type='tweets', debug=False)
    ei = EventInterface()
    ei.setDB('citybeat_experiment')
    ei.setCollection('twitter_candidate_events')
    cur = ei.getAllDocuments()
    print TwitterFeature.GenerateArffFileHeader()
    for event in cur:
        region = Region(event['region'])
        event = TwitterFeature(event, corpus=corpus_all[region.getKey()])
        if event.getActualValue() < 8:
            print '< 8'
            continue
        cnt += 1
        print event.extractFeatures()
    print cnt, cur.count()
def getBaselineEvents():
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('baseline_candidate_events')
    events = ei.getAllDocuments()
    event_list = []
    for event in events:
        e = Event(event)
        # Keep only events that pass the alert thresholds.
        if e.getActualValue() < 8 or e.getZscore() < 3:
            continue
        event_list.append(event)
    # print len(event_list)
    # return
    # Print the ids of 50 randomly chosen baseline events.
    random.shuffle(event_list)
    for i in xrange(50):
        print event_list[i]['_id']
from datetime import datetime

from event_interface import EventInterface


def getDate(utc_time):
    return repr(datetime.fromtimestamp(int(utc_time)))


# Copy every candidate event that passes the alert thresholds into the
# merged collection, in chronological order.
ei = EventInterface()
ei.setDB('citybeat')
ei.setCollection('next_week_candidate_event_25by25')

ei2 = EventInterface()
ei2.setDB('citybeat')
ei2.setCollection('next_week_candidate_event_25by25_merged')

events = ei.getAllDocuments().sort('created_time', 1)
for event in events:
    if event['actual_value'] >= 8 and event['zscore'] >= 3.0:
        ei2.addEvent(event)

#region = {'min_lat': 40.743583800000003, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.756847}
#utc_time = str(1354728300)
#region = {'min_lat': 40.730320599999999, 'max_lng': -73.978088200000002, 'min_lng': -73.998103900000004, 'max_lat': 40.743583800000003}
#utc_time = str(1354340400)
#
#condition = ({'region.min_lat': region['min_lat'],
#              'region.min_lng': region['min_lng'],
#              'region.max_lat': region['max_lat'],
#              'region.max_lng': region['max_lng']})
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

from event import Event
from event_interface import EventInterface


class Representor(object):
    def __init__(self, vectorizer=None, db='AmazonMT',
                 collection='candidate_event_25by25_merged'):
        """Given an event, return the photos in its 'photos' field that
        best represent the cluster, most representative first.

        You can pass in your own TfidfVectorizer to customize the tf-idf
        parameters; see
        http://scikit-learn.org/dev/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
        """
        self.ei = EventInterface()
        self.ei.setDB(db)
        self.ei.setCollection(collection)
        self.events = []
        for e in self.ei.getAllDocuments():
            event = Event(e)
            event.selectOnePhotoForOneUser()
            self.events.append(event.toJSON())
        self._captions = self._getAllCaptions()
        if vectorizer is None:
            self.vectorizer = TfidfVectorizer(max_df=0.05,
                                              min_df=1,
                                              strip_accents='ascii',
                                              smooth_idf=True,
                                              preprocessor=self._preProcessor,
                                              sublinear_tf=True,
                                              norm='l2',
                                              analyzer='char_wb',
                                              ngram_range=(4, 4),
                                              stop_words='english')
        else:
            self.vectorizer = vectorizer
        self.vectorizer.fit_transform(self._captions)

    def _preProcessor(self, caption):
        # Drop captions that are mostly hashtags (five or more).
        regex = re.compile(r"#\w+")
        match = regex.findall(caption)
        if len(match) >= 5:
            return ""
        return caption

    def _getAllCaptions(self):
        _captions = []
        for event in self.events:
            _captions += self._getEventCaptions(event)
        return _captions

    def _is_ascii(self, _str):
        return all(ord(c) < 128 for c in _str)

    def _getEventCaptions(self, event):
        """Return the captions of an event's photos as a list. A photo
        without a usable caption contributes an empty string, so the list
        stays aligned with event['photos']."""
        event_captions = []
        for p in event['photos']:
            try:
                if self._is_ascii(p['caption']['text']):
                    event_captions.append(p['caption']['text'].lower())
                else:
                    event_captions.append("")
            except (KeyError, TypeError):
                event_captions.append("")
        return event_captions

    def _cosine_sim(self, a, b):
        # The rows are l2-normalized, so the dot product is the cosine.
        return a * b.T

    def getRepresentivePhotos(self, event):
        """Return the event's photos sorted from most to least
        representative, by cosine similarity between each caption and the
        centroid of the event's tf-idf vectors."""
        event_captions = self._getEventCaptions(event)
        event_tfidf = self.vectorizer.transform(event_captions)
        centroid = event_tfidf.mean(axis=0)
        cosine_similarities = np.asarray(
            self._cosine_sim(centroid, event_tfidf)).flatten()
        most_related_pics = cosine_similarities.argsort()
        photos_to_return = []
        for idx in most_related_pics:
            photos_to_return.append(event['photos'][idx])
        photos_to_return.reverse()
        return photos_to_return

    def getTfidfVector(self, event):
        """Return the indices, feature strings, and weights of the nonzero
        entries in the event's mean tf-idf vector."""
        voc = self.vectorizer.get_feature_names()
        tf_vec = self.vectorizer.transform(
            self._getEventCaptions(event)).mean(axis=0)
        nonzeros = np.nonzero(tf_vec)[1]
        res_list = nonzeros.ravel().tolist()[0]
        values = []
        words = []
        for n in res_list:
            words.append(voc[n])
            values.append(tf_vec[0, n])
        return res_list, words, values

    def getCorpusWordsVector(self):
        return self.vectorizer.get_feature_names()
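# A minimal usage sketch, reusing the default db/collection above; the
# event id is illustrative, and the ranked photos carry the same fields
# as the stored documents (e.g. 'link'):
#
#   rep = Representor()
#   event = rep.ei.getEventByID('511478c8c2a3754cfe6684a9')
#   for photo in rep.getRepresentivePhotos(event)[:3]:
#       print photo['link']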
from bson.objectid import ObjectId

from event_feature import EventFeature
from event_interface import EventInterface

ei = EventInterface()
ei.setDB('historic_alarm')
ei.setCollection('raw_event')

ei2 = EventInterface()
ei2.setDB('historic_alarm')
ei2.setCollection('labeled_event')

# One-off labeling pass, kept for reference:
#fid = open('final_labels.txt', 'r')
#
#for line in fid:
#    vals = line.split()
#    label = -1
#    if len(vals) > 1 and vals[1] == '1':
#        label = 1
#    event = ei.getDocument({'_id': ObjectId(vals[0])})
#    event['label'] = label
#    ei2.updateDocument(event)

# Print the average photo location and the label of every labeled event.
events = ei2.getAllDocuments()
for event in events:
    label = event['label']
    event = EventFeature(event)
    (lat, lng) = event._getPhotoAvgLocation()
    print lat, lng, label