Example #1
    def run(self):

        # read events
        archive_events = []
        active_events = []
        date = datetime.datetime(int(self.archivedate[:4]),
                                 int(self.archivedate[4:6]),
                                 int(self.archivedate[6:8]))
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for ed in eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                if eventobj.datetime == date:
                    archive_events.append(eventobj)
                else:
                    active_events.append(eventobj)

        # write archive
        print('Writing archive')
        out_archive_events = [
            event.return_dict(txt=False) for event in archive_events
        ]
        with open(self.out_archived().path, 'w', encoding='utf-8') as file_out:
            json.dump(out_archive_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [
            event.return_dict(txt=False) for event in active_events
        ]
        with open(self.out_active_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
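
The example above parses self.archivedate by slicing; datetime.strptime expresses the same intent in one call. A minimal stdlib-only sketch of the split, assuming the serialized events carry an ISO-formatted 'datetime' field (the field name and the helper itself are hypothetical):

    import datetime

    def split_events_by_date(eventdicts, archivedate):
        # Parse 'YYYYMMDD' in one call instead of slicing the string.
        date = datetime.datetime.strptime(archivedate, '%Y%m%d')
        archive, active = [], []
        for ed in eventdicts:
            # Assumed field: an ISO timestamp such as '2016-01-31T00:00:00'.
            if datetime.datetime.fromisoformat(ed['datetime']) == date:
                archive.append(ed)
            else:
                active.append(ed)
        return archive, active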
Example #2
    def run(self):

        # read in events
        print('Reading in events')
        with open(self.in_deduplicated_events().path, 'r',
                  encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            event_objs.append(eventobj)

        # initialize event enhancer
        print('Enhancing events')
        enhancer = event_enhancer.EventEnhancer()
        enhancer.set_events(event_objs)
        enhancer.enhance()
        enhanced_events = enhancer.return_events()

        # write enhanced
        out_enhanced_events = [
            event.return_dict() for event in enhanced_events
        ]
        with open(self.out_enhanced_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_enhanced_events, file_out)
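
EventEnhancer's internals are not shown in these examples; the code above only relies on its set_events/enhance/return_events protocol. A minimal stand-in sketch of that protocol (the enrichment step is a placeholder assumption, demonstrated on a simple namespace object):

    import types

    class MinimalEnhancer:
        # Stand-in showing the set_events/enhance/return_events protocol.

        def __init__(self):
            self.events = []

        def set_events(self, events):
            self.events = events

        def enhance(self):
            for ev in self.events:
                ev.enhanced = True  # placeholder; the real enhancer adds metadata

        def return_events(self):
            return self.events

    enhancer = MinimalEnhancer()
    enhancer.set_events([types.SimpleNamespace(text='demo event')])
    enhancer.enhance()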
Example #3
    def run(self):

        # read in events
        with open(self.in_enhanced_events().path, 'r',
                  encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            event_objs.append(eventobj)

        # initialize event merger
        print('Merging; number of events at start:', len(event_objs))
        overlap_threshold = float(self.overlap_threshold)
        merger = event_merger.EventMerger()
        merger.add_events(event_objs)
        merger.find_merges(overlap_threshold)
        events_merged = merger.return_events()
        print('Merging again; current number of events:', len(events_merged))
        merger2 = event_merger.EventMerger()
        merger2.add_events(events_merged)
        merger2.find_merges(overlap_threshold)
        events_merged_final = merger2.return_events()
        print('Done. number of events after merge:', len(events_merged_final))

        # write merged
        out_merged_events = [
            event.return_dict(txt=False) for event in events_merged_final
        ]
        with open(self.out_merged_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_merged_events, file_out)
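
The two fixed merge passes above can be generalized: merging may create new overlaps, so repeating until the event count stabilizes reaches a fixed point. A sketch against the same EventMerger interface (merger_factory is a hypothetical callable, e.g. event_merger.EventMerger):

    def merge_to_fixpoint(events, overlap_threshold, merger_factory):
        while True:
            merger = merger_factory()
            merger.add_events(events)
            merger.find_merges(overlap_threshold)
            merged = merger.return_events()
            if len(merged) == len(events):
                # No further merges were found; we are done.
                return merged
            events = merged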
Example #4
    def run(self):

        # read in events
        print('Reading in events')
        with open(self.in_merged_events().path, 'r',
                  encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            event_objs.append(eventobj)

        print('Reading in citylist')
        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # initialize event filter
        print('Filtering; number of events at start:', len(event_objs))
        evfilter = event_filter.EventFilter()
        evfilter.add_events(event_objs)
        evfilter.apply_filter(citylist)
        events_filtered = evfilter.return_events()
        print('Done. number of events after filter:', len(events_filtered))

        # write filter
        out_filtered_events = [
            event.return_dict(txt=False) for event in events_filtered
        ]
        with open(self.out_filtered_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_filtered_events, file_out)
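
EventFilter is opaque here apart from apply_filter(citylist); a plausible reading is that it keeps events whose location matches a known city. A stdlib sketch of that reading, with the 'location' field name purely an assumption:

    def filter_by_citylist(eventdicts, citylist):
        cities = set(citylist)  # set membership is O(1) per lookup
        return [ed for ed in eventdicts if ed.get('location') in cities]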
Example #5
    def run(self):

        # if directory does not exist, create directory
        if not os.path.isdir(self.out_eventdir().path):
            self.setup_output_dir(self.out_eventdir().path)

        # collect tweet files
        end_date_year = self.end_date[:4]
        end_date_month = self.end_date[4:6]
        end_date_day = self.end_date[6:]
        last_date = datetime.date(int(end_date_year), int(end_date_month),
                                  int(end_date_day))
        first_date = last_date - datetime.timedelta(days=self.window_size - 1)
        last_tweetfile = (self.in_tweetdir().path + '/' + end_date_year +
                          end_date_month + '/' + end_date_year +
                          end_date_month + end_date_day +
                          '-23.out.dateref.cityref.entity.json')
        days_tweetfiles = helpers.return_tweetfiles_window(
            last_tweetfile, self.window_size - 1)
        tweetfiles = []
        for day in days_tweetfiles:
            tweetfiles.extend(
                glob.glob(self.in_tweetdir().path + '/' + day + '*'))

        print('Reading in citylist')
        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # extract events
        er = event_ranker.EventRanker()
        # read in tweets
        print('Reading in tweets')
        for tweetfile in tweetfiles:
            date = helpers.return_date_entitytweetfile(tweetfile)
            with open(tweetfile, 'r', encoding='utf-8') as file_in:
                tweetdicts = json.loads(file_in.read())
            # format as tweet objects
            for td in tweetdicts:
                if not (td['refdates'] == {} and td['entities'] == {}):
                    tweetobj = tweet.Tweet()
                    tweetobj.import_tweetdict(td)
                    er.add_tweet(tweetobj)
                er.tweet_count += 1

        # extract events
        print('Performing event extraction')
        er.extract_events(self.minimum_event_mentions, self.cut_off)
        evfilter = event_filter.EventFilter()
        evfilter.add_events(er.events)
        evfilter.apply_filter(citylist)
        events_filtered = evfilter.return_events()
        print('Done. Extracted', len(events_filtered), 'events')

        # write to file
        outevents = [event.return_dict() for event in events_filtered]
        with open(self.out_events().path, 'w', encoding='utf-8') as file_out:
            json.dump(outevents, file_out)
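
helpers.return_tweetfiles_window is not shown; from how its output is globbed, it presumably yields one 'YYYYMM/YYYYMMDD'-style prefix per day of the window. A stdlib sketch of that assumption:

    import datetime

    def window_day_prefixes(end_date, window_size):
        # end_date is a 'YYYYMMDD' string; the window ends on (and includes) it.
        last = datetime.datetime.strptime(end_date, '%Y%m%d').date()
        days = [last - datetime.timedelta(days=i) for i in range(window_size)]
        return [d.strftime('%Y%m/%Y%m%d') for d in sorted(days)]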
Example #6
    def run(self):

        # read in events
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed)
            event_objs.append(eventobj)

        # initialize event deduplicator
        similarity_threshold = float(self.similarity_threshold)
        print('Deduplicating; number of events at start:', len(event_objs))
        deduplicator = event_deduplicator.EventDeduplicator()
        deduplicator.set_events(event_objs)
        deduplicator.deduplicate_events(similarity_threshold)
        deduplicated_events = deduplicator.return_events()
        print('Done. number of events after deduplication:',
              len(deduplicated_events))

        # write deduplicated
        out_deduplicated_events = [
            event.return_dict() for event in deduplicated_events
        ]
        with open(self.out_deduplicated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_deduplicated_events, file_out)
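
EventDeduplicator's similarity measure is not visible here. As a rough illustration of threshold-based deduplication, the sketch below greedily drops near-duplicates by string similarity on a hypothetical 'text' field; the real deduplicator likely compares richer event features:

    from difflib import SequenceMatcher

    def greedy_deduplicate(eventdicts, similarity_threshold):
        kept = []
        for ed in eventdicts:
            if all(SequenceMatcher(None, ed['text'], k['text']).ratio()
                   < similarity_threshold for k in kept):
                kept.append(ed)  # not too similar to anything kept so far
        return kept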
Example #7
    def run(self):

        # read in new events
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            new_eventdicts = json.loads(file_in.read())
        new_event_objs = []
        for ed in new_eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            new_event_objs.append(eventobj)
        earliest_date = min(event.datetime for event in new_event_objs)

        # read in current events
        with open(self.current_events, 'r', encoding='utf-8') as file_in:
            current_eventdicts = json.loads(file_in.read())
        current_event_objs = []
        current_event_objs_candidates = []
        for ed in current_eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=False)
            if eventobj.datetime >= earliest_date:
                current_event_objs_candidates.append(eventobj)
            else:
                current_event_objs.append(eventobj)

        # initialize event merger
        merger = event_merger.EventMerger()
        merger.add_events(current_event_objs_candidates)

        # merge before integration
        print(
            'Merging new events before integration; number of events at start:',
            len(new_event_objs))
        overlap_threshold = float(self.overlap_threshold)
        premerger = event_merger.EventMerger()
        premerger.add_events(new_event_objs)
        premerger.find_merges(overlap_threshold)
        new_events_merged = premerger.return_events()
        print('Done. New events after merge:', len(new_events_merged))

        # integrate each event into the current ones
        print('Starting integrating new events; number of current events:',
              len(current_event_objs))
        for new_event in new_events_merged:
            merger.find_merge(new_event, overlap_threshold)

        # write merged
        integrated_events = merger.return_events() + current_event_objs
        print('Done. Number of events after integration:',
              len(integrated_events))
        out_integrated_events = [
            event.return_dict(txt=False) for event in integrated_events
        ]
        with open(self.out_integrated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
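
The candidate split above is the key optimization: only current events dated on or after the earliest new event can possibly merge with the new batch, so everything older is passed through untouched. The same partition, restated compactly (a sketch over event objects with a datetime attribute):

    def split_current_events(current_events, earliest_new_date):
        candidates = [ev for ev in current_events
                      if ev.datetime >= earliest_new_date]
        untouched = [ev for ev in current_events
                     if ev.datetime < earliest_new_date]
        return candidates, untouched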
Example #8
    def run(self):

        # initiate directory
        self.setup_output_dir(self.out_archivedir().path)

        # read events
        datebound = datetime.datetime.now() - datetime.timedelta(days=100)
        date_events = defaultdict(list)
        active_events = []
        print('Reading events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
            for ed in eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                if eventobj.datetime < datebound:
                    date_events[eventobj.datetime.strftime('%Y%m%d')].append(
                        eventobj)
                else:
                    active_events.append(eventobj)

        # write archives
        print('Writing archives')
        for date in sorted(date_events):
            print(date)
            events = date_events[date]
            out_events = [event.return_dict(txt=False) for event in events]
            outfile = self.out_archivedir().path + '/events_' + date + '.json'
            with open(outfile, 'w', encoding='utf-8') as file_out:
                json.dump(out_events, file_out)

        # write active events
        print('Writing active events')
        out_active_events = [
            event.return_dict(txt=False) for event in active_events
        ]
        with open(self.out_active_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_active_events, file_out)
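
The bucketing step groups stale events by calendar day so that each archive file holds exactly one day. A stdlib sketch of the same grouping, assuming each event object exposes a datetime.datetime under .datetime:

    import datetime
    from collections import defaultdict

    def bucket_stale_events(events, max_age_days=100):
        cutoff = datetime.datetime.now() - datetime.timedelta(days=max_age_days)
        per_day, active = defaultdict(list), []
        for ev in events:
            if ev.datetime < cutoff:
                per_day[ev.datetime.strftime('%Y%m%d')].append(ev)
            else:
                active.append(ev)
        return per_day, active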
Example #9
    def run(self):

        # collect all event files with extension '.enhanced'
        enhanced_events = glob.glob(self.in_eventdir().path + '/*.enhanced')

        # initialize
        merger = event_merger.EventMerger()
        overlap_threshold = float(self.overlap_threshold)

        # for each event file
        for eventfile in enhanced_events:
            print('Reading', eventfile)
            with open(eventfile, 'r', encoding='utf-8') as file_in:
                current_eventdicts = json.loads(file_in.read())
            new_event_objs = []
            for ed in current_eventdicts:
                eventobj = event.Event()
                eventobj.import_eventdict(ed)
                new_event_objs.append(eventobj)
            # merge before integration
            print(
                'Merging new events before integration; number of events at start:',
                len(new_event_objs))
            premerger = event_merger.EventMerger()
            premerger.add_events(new_event_objs)
            premerger.find_merges(overlap_threshold)
            new_events_merged = premerger.return_events()
            print('Done. New events after merge:', len(new_events_merged))
            if len(merger.events) == 0:
                merger.add_events(new_events_merged)
            else:
                # integrate each event into the current ones
                print(
                    'Starting integrating new events; number of current events:',
                    len(merger.events))
                for new_event in new_events_merged:
                    merger.find_merge(new_event, overlap_threshold)

        # write merged
        integrated_events = merger.return_events()
        print('Done. Number of events after integration:',
              len(integrated_events))
        out_integrated_events = [
            event.return_dict() for event in integrated_events
        ]
        with open(self.out_integrated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_integrated_events, file_out)
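
Structurally this is a fold: the first file seeds the merger, and every later event is integrated one at a time. The same shape, abstracted over the EventMerger interface used above (merger_factory is a hypothetical callable):

    def fold_merge(event_batches, overlap_threshold, merger_factory):
        merger = merger_factory()
        for batch in event_batches:
            if not merger.events:
                merger.add_events(batch)  # first batch seeds the merger
            else:
                for ev in batch:
                    merger.find_merge(ev, overlap_threshold)
        return merger.return_events()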
Example #10
    def run(self):

        # read prediction data
        with open(self.in_predictiondir().path + '/events_meta.txt', 'r',
                  encoding='utf-8') as file_in:
            meta = file_in.read().strip().split('\n')

        with open(self.in_predictiondir().path +
                  '/events_text.predictions.txt', 'r',
                  encoding='utf-8') as file_in:
            predictions = file_in.read().strip().split('\n')

        with open(self.in_predictiondir().path +
                  '/events_text.full_predictions.txt', 'r',
                  encoding='utf-8') as file_in:
            lines = file_in.read().strip().split('\n')
        label_order = lines[0].split('\t')
        full_predictions = [line.split('\t') for line in lines[1:]]

        print('Meta', len(meta))
        print('Predictions', len(predictions))
        print('Full predictions', len(full_predictions))
        
        # read in events
        print('Reading in events')
        with open(self.in_events().path, 'r', encoding='utf-8') as file_in:
            eventdicts = json.loads(file_in.read())
        event_objs = []
        for ed in eventdicts:
            eventobj = event.Event()
            eventobj.import_eventdict(ed, txt=self.text)
            event_objs.append(eventobj)

        # index events
        id_event = {}
        for eo in event_objs:
            id_event[eo.mongo_id] = eo

        # for each prediction
        for i, mid in enumerate(meta):
            prediction = predictions[i]
            prediction_score = dict(zip(label_order, full_predictions[i]))
            eo = id_event[mid]
            eo.eventtype = prediction
            eo.eventtype_scores = prediction_score

        # write output
        out_updated_events = [
            event.return_dict(txt=self.text) for event in event_objs
        ]
        with open(self.out_updated_events().path, 'w',
                  encoding='utf-8') as file_out:
            json.dump(out_updated_events, file_out)
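
The three prediction files are joined purely by line position, which the three length prints above let you eyeball. The same join isolated as a sketch, with an explicit alignment check (assumes row i of each file describes event i, as the example does):

    def attach_predictions(id_event, meta, predictions, label_order,
                           full_predictions):
        # The files are assumed line-aligned: row i of each describes event i.
        assert len(meta) == len(predictions) == len(full_predictions)
        for mid, label, scores in zip(meta, predictions, full_predictions):
            eo = id_event[mid]
            eo.eventtype = label
            eo.eventtype_scores = dict(zip(label_order, scores))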
Example #11
    def run(self):

        # create directory
        self.setup_output_dir(self.out_eventdir().path)

        # set dates
        end_date_year = self.end_date[:4]
        end_date_month = self.end_date[4:6]
        end_date_day = self.end_date[6:]
        last_date = datetime.date(int(end_date_year), int(end_date_month),
                                  int(end_date_day))
        start_date_year = self.start_date[:4]
        start_date_month = self.start_date[4:6]
        start_date_day = self.start_date[6:]
        first_date = datetime.date(int(start_date_year), int(start_date_month),
                                   int(start_date_day))

        print('Reading in citylist')
        # read in citylist
        with open(self.citylist, 'r', encoding='utf-8') as file_in:
            citylist = [
                line.strip() for line in file_in.read().strip().split('\n')
            ]

        # perform event extraction on first tweet window
        print('Reading in first window of tweets')
        cursor_date = first_date
        date_tweetobjects = defaultdict(list)
        date_tweetcounts = defaultdict(int)
        window_dates = []
        while cursor_date < first_date + datetime.timedelta(
                days=self.window_size):
            print(cursor_date)
            tweetfiles = glob.glob(self.in_tweetdir().path + '/' +
                                   helpers.return_timeobj_date(cursor_date) +
                                   '*')
            for tweetfile in tweetfiles:
                # read in tweets
                with open(tweetfile, 'r', encoding='utf-8') as file_in:
                    tweetdicts = json.loads(file_in.read())
                date_tweetcounts[cursor_date] += len(tweetdicts)
                for td in tweetdicts:
                    if not (td['refdates'] == {} and td['entities'] == {}):
                        tweetobj = tweet.Tweet()
                        tweetobj.import_tweetdict(td)
                        date_tweetobjects[cursor_date].append(tweetobj)
            window_dates.append(cursor_date)
            cursor_date += datetime.timedelta(days=1)
        # start event extraction
        print('Loading tweets into event extractor')
        er = event_ranker.EventRanker()
        for date in window_dates:
            for tweetobject in date_tweetobjects[date]:
                er.add_tweet(tweetobject)
            er.tweet_count += date_tweetcounts[date]
        print('Performing event extraction')
        er.extract_events(self.minimum_event_mentions, self.cut_off)
        evfilter = event_filter.EventFilter()
        evfilter.add_events(er.events)
        evfilter.apply_filter(citylist)
        events_filtered = evfilter.return_events()
        print('Done. Extracted', len(events_filtered), 'events')
        # write to file
        outevents = [event.return_dict() for event in events_filtered]
        with open(self.out_eventdir().path + '/' +
                  str(cursor_date - datetime.timedelta(days=1)).replace(
                      '-', '') + '.events',
                  'w',
                  encoding='utf-8') as file_out:
            json.dump(outevents, file_out)

        # slide window forward and perform event extraction until last date
        print('Starting slider')
        window_tail = first_date
        window_head = cursor_date - datetime.timedelta(days=1)
        while window_head <= last_date:
            print('Discarding and collecting tweets')
            end_slider = window_head + datetime.timedelta(days=self.slider)
            while window_head < end_slider:
                # remove tweets of tail
                print('Deleting records for old tail', window_tail)
                del date_tweetobjects[window_tail]
                del date_tweetcounts[window_tail]
                window_tail = window_tail + datetime.timedelta(days=1)
                window_head = window_head + datetime.timedelta(days=1)
                print('Collecting tweets for new head', window_head)
                tweetfiles = glob.glob(
                    self.in_tweetdir().path + '/' +
                    helpers.return_timeobj_date(window_head) + '*')
                for tweetfile in tweetfiles:
                    # read in tweets
                    with open(tweetfile, 'r', encoding='utf-8') as file_in:
                        tweetdicts = json.loads(file_in.read())
                    date_tweetcounts[window_head] += len(tweetdicts)
                    for td in tweetdicts:
                        if not (td['refdates'] == {} and td['entities'] == {}):
                            tweetobj = tweet.Tweet()
                            tweetobj.import_tweetdict(td)
                            date_tweetobjects[window_head].append(tweetobj)
            # add tweets to event ranker
            print('Loading tweets into event extractor for', window_tail, '-',
                  window_head)
            er = event_ranker.EventRanker()
            window_dates = helpers.return_daterange(window_tail,
                                                    self.window_size)
            for date in window_dates:
                for tweetobject in date_tweetobjects[date]:
                    er.add_tweet(tweetobject)
                er.tweet_count += date_tweetcounts[date]
            print('Performing event extraction')
            er.extract_events(self.minimum_event_mentions, self.cut_off)
            evfilter = event_filter.EventFilter()
            evfilter.add_events(er.events)
            evfilter.apply_filter(citylist)
            events_filtered = evfilter.return_events()
            print('Done. Extracted', len(events_filtered), 'events')
            # write to file
            outevents = [event.return_dict() for event in events_filtered]
            with open(self.out_eventdir().path + '/' +
                      str(window_head).replace('-', '') + '.events',
                      'w',
                      encoding='utf-8') as file_out:
                json.dump(outevents, file_out)
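
The slider's bookkeeping is easy to lose in the loop above: each step drops the tail day's cached tweets, advances both ends of the window, and reports which new head days still need loading. Isolated as a stdlib sketch:

    import datetime

    def slide_window(window_tail, window_head, step_days, per_day_cache):
        new_head_days = []
        for _ in range(step_days):
            per_day_cache.pop(window_tail, None)  # discard the old tail day
            window_tail += datetime.timedelta(days=1)
            window_head += datetime.timedelta(days=1)
            new_head_days.append(window_head)  # these days still need loading
        return window_tail, window_head, new_head_days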