def run(self):
    # read in new events
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        new_eventdicts = json.loads(file_in.read())
    new_event_objs = []
    for ed in new_eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=False)
        new_event_objs.append(eventobj)
    earliest_date = min(ev.datetime for ev in new_event_objs)

    # read in current events
    with open(self.current_events, 'r', encoding='utf-8') as file_in:
        current_eventdicts = json.loads(file_in.read())
    current_event_objs = []
    current_event_objs_candidates = []
    for ed in current_eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=False)
        if eventobj.datetime >= earliest_date:
            current_event_objs_candidates.append(eventobj)
        else:
            current_event_objs.append(eventobj)

    # initialize event merger
    merger = event_merger.EventMerger()
    merger.add_events(current_event_objs_candidates)

    # merge before integration
    print('Merging new events before integration; number of events at start:', len(new_event_objs))
    overlap_threshold = float(self.overlap_threshold)
    premerger = event_merger.EventMerger()
    premerger.add_events(new_event_objs)
    premerger.find_merges(overlap_threshold)
    new_events_merged = premerger.return_events()
    print('Done. New events after merge:', len(new_events_merged))

    # integrate each event into the current ones
    print('Starting integrating new events; number of current events:', len(current_event_objs))
    for new_event in new_events_merged:
        merger.find_merge(new_event, overlap_threshold)

    # write merged
    integrated_events = merger.return_events() + current_event_objs
    print('Done. Number of events after integration:', len(integrated_events))
    out_integrated_events = [ev.return_dict(txt=False) for ev in integrated_events]
    with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_integrated_events, file_out)
def run(self):
    # read in events
    print('Reading in events')
    with open(self.in_deduplicated_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        event_objs.append(eventobj)

    # initialize event enhancer
    print('Enhancing events')
    enhancer = event_enhancer.EventEnhancer()
    enhancer.set_events(event_objs)
    enhancer.enhance()
    enhanced_events = enhancer.return_events()

    # write enhanced events
    out_enhanced_events = [ev.return_dict() for ev in enhanced_events]
    with open(self.out_enhanced_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_enhanced_events, file_out)
def run(self):
    # read in events
    with open(self.in_enhanced_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        event_objs.append(eventobj)

    # initialize event merger
    print('Merging; number of events at start:', len(event_objs))
    overlap_threshold = float(self.overlap_threshold)
    merger = event_merger.EventMerger()
    merger.add_events(event_objs)
    merger.find_merges(overlap_threshold)
    events_merged = merger.return_events()

    print('Merging again; current number of events:', len(events_merged))
    merger2 = event_merger.EventMerger()
    merger2.add_events(events_merged)
    merger2.find_merges(overlap_threshold)
    events_merged_final = merger2.return_events()
    print('Done. number of events after merge:', len(events_merged_final))

    # write merged
    out_merged_events = [ev.return_dict(txt=False) for ev in events_merged_final]
    with open(self.out_merged_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_merged_events, file_out)
def run(self):
    # read in events
    print('Reading in events')
    with open(self.in_merged_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=False)
        event_objs.append(eventobj)

    # read in citylist
    print('Reading in citylist')
    with open(self.citylist, 'r', encoding='utf-8') as file_in:
        citylist = [line.strip() for line in file_in.read().strip().split('\n')]

    # initialize event filter (named eventfilter to avoid shadowing the builtin filter)
    print('Filtering; number of events at start:', len(event_objs))
    eventfilter = event_filter.EventFilter()
    eventfilter.add_events(event_objs)
    eventfilter.apply_filter(citylist)
    events_filtered = eventfilter.return_events()
    print('Done. number of events after filter:', len(events_filtered))

    # write filtered events
    out_filtered_events = [ev.return_dict(txt=False) for ev in events_filtered]
    with open(self.out_filtered_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_filtered_events, file_out)
def run(self):
    # read events
    archive_events = []
    active_events = []
    date = datetime.datetime(int(self.archivedate[:4]), int(self.archivedate[4:6]), int(self.archivedate[6:8]))
    print('Reading events')
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        if eventobj.datetime == date:
            archive_events.append(eventobj)
        else:
            active_events.append(eventobj)

    # write archive
    print('Writing archive')
    out_archive_events = [ev.return_dict(txt=False) for ev in archive_events]
    with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_archive_events, file_out)

    # write active events
    print('Writing active events')
    out_active_events = [ev.return_dict(txt=False) for ev in active_events]
    with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_active_events, file_out)
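# Side note (illustrative only, not part of the pipeline): the YYYYMMDD parameters
# parsed by slicing above (self.archivedate here, and self.first_event_date /
# self.last_event_date further down) could also be parsed with the standard
# library, which raises a ValueError on malformed input rather than slicing it
# silently, e.g.:
#
#     date = datetime.datetime.strptime(self.archivedate, '%Y%m%d')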
def run(self):
    # read in events
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        event_objs.append(eventobj)

    # initialize event deduplicator
    similarity_threshold = float(self.similarity_threshold)
    print('Deduplicating; number of events at start:', len(event_objs))
    deduplicator = event_deduplicator.EventDeduplicator()
    deduplicator.set_events(event_objs)
    deduplicator.deduplicate_events(similarity_threshold)
    deduplicated_events = deduplicator.return_events()
    print('Done. number of events after deduplication:', len(deduplicated_events))

    # write deduplicated
    out_deduplicated_events = [ev.return_dict() for ev in deduplicated_events]
    with open(self.out_deduplicated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_deduplicated_events, file_out)
def run(self):
    # collect all event files with extension '.enhanced'
    enhanced_events = glob.glob(self.in_eventdir().path + '/*.enhanced')

    # initialize
    merger = event_merger.EventMerger()
    overlap_threshold = float(self.overlap_threshold)

    # for each event file
    for eventfile in enhanced_events:
        print('Reading', eventfile)
        with open(eventfile, 'r', encoding='utf-8') as file_in:
            current_eventdicts = json.loads(file_in.read())
        new_event_objs = []
        for ed in current_eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            new_event_objs.append(eventobj)

        # merge before integration
        print('Merging new events before integration; number of events at start:', len(new_event_objs))
        premerger = event_merger.EventMerger()
        premerger.add_events(new_event_objs)
        premerger.find_merges(overlap_threshold)
        new_events_merged = premerger.return_events()
        print('Done. New events after merge:', len(new_events_merged))

        if len(merger.events) == 0:
            merger.add_events(new_events_merged)
        else:
            # integrate each event into the current ones
            print('Starting integrating new events; number of current events:', len(merger.events))
            for new_event in new_events_merged:
                merger.find_merge(new_event, overlap_threshold)

    # write merged
    integrated_events = merger.return_events()
    print('Done. Number of events after integration:', len(integrated_events))
    out_integrated_events = [ev.return_dict() for ev in integrated_events]
    with open(self.out_integrated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_integrated_events, file_out)
def run(self):
    # read prediction data
    with open(self.in_predictiondir().path + '/events_meta.txt', 'r', encoding='utf-8') as file_in:
        meta = file_in.read().strip().split('\n')
    with open(self.in_predictiondir().path + '/events_text.predictions.txt', 'r', encoding='utf-8') as file_in:
        predictions = file_in.read().strip().split('\n')
    with open(self.in_predictiondir().path + '/events_text.full_predictions.txt', 'r', encoding='utf-8') as file_in:
        lines = file_in.read().strip().split('\n')
    label_order = lines[0].split('\t')
    full_predictions = [line.split('\t') for line in lines[1:]]
    print('Meta', len(meta))
    print('Predictions', len(predictions))
    print('Full predictions', len(full_predictions))

    # read in events
    print('Reading in events')
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=self.text)
        event_objs.append(eventobj)

    # index events by mongo id
    id_event = {}
    for eo in event_objs:
        id_event[eo.mongo_id] = eo

    # attach each prediction to its event
    for i, mid in enumerate(meta):
        prediction = predictions[i]
        prediction_score = dict(zip(label_order, full_predictions[i]))
        eo = id_event[mid]
        eo.eventtype = prediction
        eo.eventtype_scores = prediction_score

    # write output
    out_updated_events = [ev.return_dict(txt=self.text) for ev in event_objs]
    with open(self.out_updated_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_updated_events, file_out)
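# Illustration (inferred from the parsing above, not a file specification): the
# events_text.full_predictions.txt file is expected to hold a tab-separated header
# of class labels followed by one tab-separated row of scores per line of
# events_meta.txt, in the same order, e.g. (label names are placeholders):
#
#     label_a<TAB>label_b<TAB>label_c
#     0.1<TAB>0.7<TAB>0.2
#     ...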
def run(self):
    # initialize output directory
    self.setup_output_dir(self.out_archivedir().path)

    # read events
    datebound = datetime.datetime.now() - datetime.timedelta(days=100)
    date_events = defaultdict(list)
    active_events = []
    print('Reading events')
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        if eventobj.datetime < datebound:
            # key archived events by date string (YYYYMMDD)
            datestr = ''.join(str(eventobj.datetime).split()[0].split('-'))
            date_events[datestr].append(eventobj)
        else:
            active_events.append(eventobj)

    # write archives
    print('Writing archives')
    for date in sorted(date_events.keys()):
        print(date)
        events = date_events[date]
        out_events = [ev.return_dict(txt=False) for ev in events]
        outfile = self.out_archivedir().path + '/events_' + date + '.json'
        with open(outfile, 'w', encoding='utf-8') as file_out:
            json.dump(out_events, file_out)

    # write active events
    print('Writing active events')
    out_active_events = [ev.return_dict(txt=False) for ev in active_events]
    with open(self.out_active_events().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_active_events, file_out)
def run(self):
    first_event_date_dt = datetime.datetime(int(self.first_event_date[:4]), int(self.first_event_date[4:6]), int(self.first_event_date[6:]))
    last_event_date_dt = datetime.datetime(int(self.last_event_date[:4]), int(self.last_event_date[4:6]), int(self.last_event_date[6:]))

    # read in bursty entities
    print('Reading in bursty entities')
    with open(self.in_entity_burstiness().path, 'r', encoding='utf-8') as file_in:
        bursty_entities = file_in.read().strip().split('\n')
    set_bursty_entities = set(bursty_entities)

    # read in events
    term_events = defaultdict(list)
    print('Reading in events')
    extended_events = []
    with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed)
        if eventobj.datetime >= first_event_date_dt - datetime.timedelta(days=100) and eventobj.datetime <= last_event_date_dt + datetime.timedelta(days=100):
            extended_events.append(eventobj)
            # index the event by each bursty entity it mentions
            for term in set_bursty_entities & set(ed['entities']):
                term_events[term].append(eventobj)

    # for each entity, collect the dates around its events
    print('Saving event tweets dates by entity')
    date_entity_events = defaultdict(lambda: defaultdict(list))
    for entity in bursty_entities:
        if len(term_events[entity]) == 0:
            continue
        for i, ev in enumerate(term_events[entity]):
            if ev.datetime >= first_event_date_dt and ev.datetime <= last_event_date_dt:
                # window before the event: half the gap to the previous event, capped at 100 days
                if i == 0:
                    minus = 100
                else:
                    gap_before = (ev.datetime - term_events[entity][i - 1].datetime).days
                    minus = 100 if (gap_before > 199 or gap_before < 3) else gap_before / 2
                # window after the event: half the gap to the next event, capped at 100 days
                if i == len(term_events[entity]) - 1:
                    plus = 100
                else:
                    gap_after = (term_events[entity][i + 1].datetime - ev.datetime).days
                    plus = 100 if (gap_after > 199 or gap_after < 3) else gap_after / 2
                first = ev.datetime - datetime.timedelta(days=minus)
                last = ev.datetime + datetime.timedelta(days=plus)
                cursor = first
                while cursor <= last:
                    date_str = ''.join(str(cursor).split()[0].split('-'))
                    date_entity_events[date_str][entity].append(ev)
                    cursor += datetime.timedelta(days=1)

    # read in tweets
    print('Collecting additional tweets')
    dates = list(date_entity_events.keys())
    months = list(set([d[:6] for d in dates]))
    tweetsubdirs = sorted([subdir for subdir in glob.glob(self.in_tweetdir().path + '/*')])
    entity_tweets = defaultdict(list)
    first = True
    for tweetsubdir in tweetsubdirs:
        subdirstr = tweetsubdir.split('/')[-1]
        if subdirstr in months:
            # go through all tweet files
            tweetfiles = [tweetfile for tweetfile in glob.glob(tweetsubdir + '/*.entity.json')]
            for tweetfile in tweetfiles:
                print(tweetfile)
                datestr = tweetfile.split('/')[-1].split('.')[0].split('-')[0]
                if datestr in dates:
                    if first:
                        candidate_entities = list(date_entity_events[datestr].keys())
                        set_candidate_entities = set(candidate_entities)
                        cursordate = datestr
                        first = False
                    elif datestr != cursordate:
                        # attach the tweets collected for the previous date to its events
                        for entity in candidate_entities:
                            for ev in date_entity_events[cursordate][entity]:
                                ev.add_tweets(entity_tweets[entity])
                        cursordate = datestr
                        candidate_entities = list(date_entity_events[datestr].keys())
                        set_candidate_entities = set(candidate_entities)
                        entity_tweets = defaultdict(list)
                    # read in tweets
                    with open(tweetfile, 'r', encoding='utf-8') as file_in:
                        tweetdicts = json.loads(file_in.read())
                    for td in tweetdicts:
                        if len(set_candidate_entities & set(td['entities'].keys())) > 0:
                            tweetobj = tweet.Tweet()
                            tweetobj.import_tweetdict(td)
                            for term in set_candidate_entities & set(td['entities'].keys()):
                                entity_tweets[term].append(tweetobj)

    # write to file
    print('Writing new events')
    out_extended_events = [ev.return_dict() for ev in extended_events]
    with open(self.out_more_tweets().path, 'w', encoding='utf-8') as file_out:
        json.dump(out_extended_events, file_out)
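# Hypothetical helper (a sketch, not part of the pipeline): the JSON-to-Event
# loading loop that opens each run() above could be factored out along these
# lines, assuming the same `event` module and JSON layout used throughout:
def load_events(path, txt=True):
    """Read a JSON file of event dicts and return a list of Event objects."""
    with open(path, 'r', encoding='utf-8') as file_in:
        eventdicts = json.loads(file_in.read())
    event_objs = []
    for ed in eventdicts:
        eventobj = event.Event()
        eventobj.import_eventdict(ed, txt=txt)
        event_objs.append(eventobj)
    return event_objs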