def loadNextWeekData():
    """Load the labeled next-week candidate events and split them by label.

    Reads ``labeled_data_cf/label_next_week.txt``, in which every non-blank
    line has the form ``<event id>,<label>``. Each referenced document is
    fetched from the ``next_week_candidate_event_25by25_merged`` collection of
    the ``citybeat`` database and the label is stored on it under ``'label'``.

    Events for which ``Event.getActualValue()`` is below 8, or whose label is
    0, are skipped.

    Returns:
        A ``(true_events, false_events)`` tuple of lists of event documents.
        Label 1 goes to ``true_events``; every other remaining label goes to
        ``false_events``.

    Raises:
        IndexError: If a non-blank line contains no comma.
        ValueError: If the label field is not an integer.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('next_week_candidate_event_25by25_merged')

    true_events = []
    false_events = []

    # `with` guarantees the label file is closed even if a lookup below raises.
    with open('labeled_data_cf/label_next_week.txt', 'r') as label_file:
        for line in label_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(',')
            event_id = fields[0].strip()
            label = int(fields[1])

            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            if Event(event).getActualValue() < 8 or label == 0:
                continue

            if label == 1:
                true_events.append(event)
            else:
                false_events.append(event)

    return true_events, false_events
def loadNextWeekData():
    """Load the labeled next-week candidate events and split them by label.

    Reads ``labeled_data_cf/label_next_week.txt``, in which every non-blank
    line has the form ``<event id>,<label>``. Each referenced document is
    fetched from the ``next_week_candidate_event_25by25_merged`` collection of
    the ``citybeat`` database and the label is stored on it under ``'label'``.

    Events for which ``Event.getActualValue()`` is below 8, or whose label is
    0, are skipped.

    Returns:
        A ``(true_events, false_events)`` tuple of lists of event documents.
        Label 1 goes to ``true_events``; every other remaining label goes to
        ``false_events``.

    Raises:
        IndexError: If a non-blank line contains no comma.
        ValueError: If the label field is not an integer.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('next_week_candidate_event_25by25_merged')

    true_events = []
    false_events = []

    # `with` guarantees the label file is closed even if a lookup below raises.
    with open('labeled_data_cf/label_next_week.txt', 'r') as label_file:
        for line in label_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(',')
            event_id = fields[0].strip()
            label = int(fields[1])

            event = ei.getDocument({'_id': ObjectId(event_id)})
            event['label'] = label
            if Event(event).getActualValue() < 8 or label == 0:
                continue

            if label == 1:
                true_events.append(event)
            else:
                false_events.append(event)

    return true_events, false_events
def loadUnbalancedData(_182):
    """Load labeled candidate events: all positives plus confident negatives.

    Labels come from ``labeled_data_cf/data2.txt``. Only lines with exactly
    three whitespace-separated fields ``<label> <confidence> <path>`` are
    used; the event id is the last ``/``-separated component of ``<path>``.
    ``not_sure`` lines are ignored, ``yes`` maps to label 1 and anything else
    to -1. A label listed for the same event id in the override file replaces
    the one derived from ``data2.txt``.

    Events are fetched from the ``candidate_event_25by25_merged`` collection
    of the ``citybeat`` database and get the final label stored under
    ``'label'``. Events for which ``Event.getActualValue()`` is below 8, or
    whose final label is 0, are skipped.

    Args:
        _182: If truthy, overrides are read from
            ``labeled_data_cf/182_positive.txt``; otherwise from
            ``labeled_data_cf/181_positive.txt``. Each non-blank line of the
            override file has the form ``<event id>,<label>``.

    Returns:
        A ``(true_events, false_events)`` tuple of lists of event documents.
        ``true_events`` holds events labeled 1; ``false_events`` holds events
        labeled -1 whose confidence in ``data2.txt`` equals 1.

    Raises:
        IndexError: If a non-blank override line contains no comma.
        ValueError: If an override label is not an integer or a confidence
            field is not a number.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    if _182:
        override_path = 'labeled_data_cf/182_positive.txt'
    else:
        override_path = 'labeled_data_cf/181_positive.txt'

    # Manually corrected labels, keyed by event id.
    modified_events = {}
    with open(override_path, 'r') as override_file:
        for line in override_file:
            if not line.strip():
                # Tolerate blank lines, as the data2.txt parser below does.
                continue
            fields = line.split(',')
            modified_events[fields[0].strip()] = int(fields[1])

    true_events = []
    false_events = []

    # `with` guarantees the file is closed even if a lookup below raises.
    with open('labeled_data_cf/data2.txt', 'r') as data_file:
        for line in data_file:
            fields = line.split()
            if len(fields) != 3:
                # Blank or malformed line.
                continue

            raw_label = fields[0].lower()
            confidence = float(fields[1])
            event_id = fields[2].split('/')[-1]
            if raw_label == 'not_sure':
                continue

            event = ei.getDocument({'_id': ObjectId(event_id)})
            label = 1 if raw_label == 'yes' else -1
            # An override, when present, wins over the data2.txt label.
            label = modified_events.get(event_id, label)
            event['label'] = label

            if Event(event).getActualValue() < 8 or label == 0:
                continue

            if label == 1:
                true_events.append(event)
            elif label == -1 and confidence == 1:
                # Negatives are kept only when the labelers were fully confident.
                false_events.append(event)

    return true_events, false_events
def loadUnbalancedData(_182):
    """Load labeled candidate events: all positives plus confident negatives.

    Labels come from ``labeled_data_cf/data2.txt``. Only lines with exactly
    three whitespace-separated fields ``<label> <confidence> <path>`` are
    used; the event id is the last ``/``-separated component of ``<path>``.
    ``not_sure`` lines are ignored, ``yes`` maps to label 1 and anything else
    to -1. A label listed for the same event id in the override file replaces
    the one derived from ``data2.txt``.

    Events are fetched from the ``candidate_event_25by25_merged`` collection
    of the ``citybeat`` database and get the final label stored under
    ``'label'``. Events for which ``Event.getActualValue()`` is below 8, or
    whose final label is 0, are skipped.

    Args:
        _182: If truthy, overrides are read from
            ``labeled_data_cf/182_positive.txt``; otherwise from
            ``labeled_data_cf/181_positive.txt``. Each non-blank line of the
            override file has the form ``<event id>,<label>``.

    Returns:
        A ``(true_events, false_events)`` tuple of lists of event documents.
        ``true_events`` holds events labeled 1; ``false_events`` holds events
        labeled -1 whose confidence in ``data2.txt`` equals 1.

    Raises:
        IndexError: If a non-blank override line contains no comma.
        ValueError: If an override label is not an integer or a confidence
            field is not a number.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    if _182:
        override_path = 'labeled_data_cf/182_positive.txt'
    else:
        override_path = 'labeled_data_cf/181_positive.txt'

    # Manually corrected labels, keyed by event id.
    modified_events = {}
    with open(override_path, 'r') as override_file:
        for line in override_file:
            if not line.strip():
                # Tolerate blank lines, as the data2.txt parser below does.
                continue
            fields = line.split(',')
            modified_events[fields[0].strip()] = int(fields[1])

    true_events = []
    false_events = []

    # `with` guarantees the file is closed even if a lookup below raises.
    with open('labeled_data_cf/data2.txt', 'r') as data_file:
        for line in data_file:
            fields = line.split()
            if len(fields) != 3:
                # Blank or malformed line.
                continue

            raw_label = fields[0].lower()
            confidence = float(fields[1])
            event_id = fields[2].split('/')[-1]
            if raw_label == 'not_sure':
                continue

            event = ei.getDocument({'_id': ObjectId(event_id)})
            label = 1 if raw_label == 'yes' else -1
            # An override, when present, wins over the data2.txt label.
            label = modified_events.get(event_id, label)
            event['label'] = label

            if Event(event).getActualValue() < 8 or label == 0:
                continue

            if label == 1:
                true_events.append(event)
            elif label == -1 and confidence == 1:
                # Negatives are kept only when the labelers were fully confident.
                false_events.append(event)

    return true_events, false_events
def getAllActualEvents():
    """Return every candidate event whose final label is 1.

    Labels come from ``labeled_data_cf/data2.txt``. Only lines with exactly
    three whitespace-separated fields ``<label> <confidence> <path>`` are
    used; the event id is the last ``/``-separated component of ``<path>``
    and the confidence column is ignored. ``not_sure`` lines are skipped,
    ``yes`` maps to label 1 and anything else to -1. A label listed for the
    same event id in ``labeled_data_cf/181_positive.txt`` (non-blank lines of
    the form ``<event id>,<label>``) replaces the one from ``data2.txt``.

    Events are fetched from the ``candidate_event_25by25_merged`` collection
    of the ``citybeat`` database and get the final label stored under
    ``"label"``. Events for which ``Event.getActualValue()`` is below 8 are
    skipped.

    Returns:
        A list of the event documents whose final label is 1.

    Raises:
        IndexError: If a non-blank override line contains no comma.
        ValueError: If an override label is not an integer.
    """
    ei = EventInterface()
    ei.setDB("citybeat")
    ei.setCollection("candidate_event_25by25_merged")

    # Manually corrected labels, keyed by event id.
    modified_events = {}
    with open("labeled_data_cf/181_positive.txt", "r") as override_file:
        for line in override_file:
            if not line.strip():
                # Tolerate blank lines, as the data2.txt parser below does.
                continue
            fields = line.split(",")
            modified_events[fields[0].strip()] = int(fields[1])

    true_events = []

    # `with` guarantees the file is closed even if a lookup below raises.
    with open("labeled_data_cf/data2.txt", "r") as data_file:
        for line in data_file:
            fields = line.split()
            if len(fields) != 3:
                # Blank or malformed line.
                continue

            raw_label = fields[0].lower()
            if raw_label == "not_sure":
                continue
            event_id = fields[2].split("/")[-1]

            event = ei.getDocument({"_id": ObjectId(event_id)})
            label = 1 if raw_label == "yes" else -1
            # An override, when present, wins over the data2.txt label.
            label = modified_events.get(event_id, label)
            event["label"] = label

            if Event(event).getActualValue() < 8:
                continue
            if label == 1:
                true_events.append(event)

    return true_events
def getBaselineEvents(sample_size=50):
    """Print the ids of a random sample of qualifying baseline events.

    Reads every document of the ``baseline_candidate_events`` collection in
    the ``citybeat`` database and keeps those for which
    ``Event.getActualValue()`` is at least 8 and ``Event.getZscore()`` is at
    least 3. The kept events are shuffled and the ``'_id'`` of the first
    ``sample_size`` of them is printed, one per line.

    Args:
        sample_size: Maximum number of ids to print. Defaults to 50, the
            value that was previously hard-coded.

    Returns:
        None. The result is written to standard output only.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('baseline_candidate_events')

    event_list = []
    for event in ei.getAllDocuments():
        e = Event(event)
        if e.getActualValue() < 8 or e.getZscore() < 3:
            continue
        event_list.append(event)

    # NOTE(review): the source layout was flattened; this assumes the shuffle
    # was live code and only the debug `print`/`return` lines were commented
    # out -- confirm.
    random.shuffle(event_list)

    # Slicing instead of indexing 0..49 avoids an IndexError when fewer than
    # `sample_size` events pass the filter.
    for event in event_list[:sample_size]:
        print(event['_id'])
def generateTrueLabelFile():
    """Print ``<event id>,<label>`` lines for the labeled events, grouped by label.

    Reads ``labeled_data_cf/181_positive.txt``, in which every non-blank line
    has the form ``<event id>,<label>`` (a later line for the same id replaces
    an earlier one). Each event is fetched from the
    ``candidate_event_25by25_merged`` collection of the ``citybeat`` database
    and events for which ``Event.getActualValue()`` is below 8 are dropped.

    The remaining events are printed to standard output as
    ``<event id>,<label>``: first those labeled 1, then those labeled -1,
    then every other label.

    Returns:
        None. The result is written to standard output only.

    Raises:
        IndexError: If a non-blank line contains no comma.
        ValueError: If the label field is not an integer.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    labels_by_id = {}
    # `with` guarantees the label file is closed even if parsing raises.
    with open('labeled_data_cf/181_positive.txt', 'r') as label_file:
        for line in label_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(',')
            labels_by_id[fields[0].strip()] = int(fields[1])

    true_events = []
    false_events = []
    unknown_events = []
    for event_id, label in labels_by_id.items():
        event = ei.getDocument({'_id': ObjectId(event_id)})
        event['label'] = label
        if Event(event).getActualValue() < 8:
            continue

        if label == 1:
            true_events.append(event)
        elif label == -1:
            false_events.append(event)
        else:
            unknown_events.append(event)

    for event in true_events + false_events + unknown_events:
        print(str(event['_id']) + ',' + str(event['label']))
def generateTrueLabelFile():
    """Print ``<event id>,<label>`` lines for the labeled events, grouped by label.

    Reads ``labeled_data_cf/181_positive.txt``, in which every non-blank line
    has the form ``<event id>,<label>`` (a later line for the same id replaces
    an earlier one). Each event is fetched from the
    ``candidate_event_25by25_merged`` collection of the ``citybeat`` database
    and events for which ``Event.getActualValue()`` is below 8 are dropped.

    The remaining events are printed to standard output as
    ``<event id>,<label>``: first those labeled 1, then those labeled -1,
    then every other label.

    Returns:
        None. The result is written to standard output only.

    Raises:
        IndexError: If a non-blank line contains no comma.
        ValueError: If the label field is not an integer.
    """
    ei = EventInterface()
    ei.setDB('citybeat')
    ei.setCollection('candidate_event_25by25_merged')

    labels_by_id = {}
    # `with` guarantees the label file is closed even if parsing raises.
    with open('labeled_data_cf/181_positive.txt', 'r') as label_file:
        for line in label_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            fields = line.split(',')
            labels_by_id[fields[0].strip()] = int(fields[1])

    true_events = []
    false_events = []
    unknown_events = []
    for event_id, label in labels_by_id.items():
        event = ei.getDocument({'_id': ObjectId(event_id)})
        event['label'] = label
        if Event(event).getActualValue() < 8:
            continue

        if label == 1:
            true_events.append(event)
        elif label == -1:
            false_events.append(event)
        else:
            unknown_events.append(event)

    for event in true_events + false_events + unknown_events:
        print(str(event['_id']) + ',' + str(event['label']))
from event_interface import EventInterface
from event_feature import EventFeature
from photo_interface import PhotoInterface
from photo import Photo
from region import Region
from event import Event
from caption_parser import CaptionParser
from stopwords import Stopwords

import operator
import string
import types
import random
import math

# Script: strip duplicate photos from every candidate event stored in the
# 'candidate_event_25by25_merged' collection of the 'AmazonMT' database.
ei = EventInterface()
ei.setDB('AmazonMT')
ei.setCollection('candidate_event_25by25_merged')

events = ei.getAllDocuments()
duplicates = 0

for event in events:
    e = Event(event)
    flag = e.removeDuplicatePhotos()
    if not flag > 0:
        continue
    # Report the photo count and actual value of each event that changed.
    print('%s %s' % (e.getPhotoNumber(), e.getActualValue()))
    # NOTE(review): the source layout was flattened; this assumes the write-back
    # happens only for events that actually had duplicates removed -- confirm.
    ei.updateDocument(e)