Example #1
import gzip
from datetime import datetime

# The Elastic wrapper, DOCUMENT_INDEX and clean_text are assumed to be
# provided by the surrounding codebase (see the db.elastic import in Example #2).

def export():
    es = Elastic()

    query = {}
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    n = 1
    with gzip.open('tweets.gz', 'wt', encoding='utf-8') as f:
        for tweet in tweets:
            if not n % 1000:
                print(f"{n} - {datetime.now()}")
            tweet = tweet['_source']
            if 'locations' in tweet:
                n += 1
                ID = tweet['id']
                text = clean_text(tweet['text'], lower=False)
                f.write(f'{ID}\t{text}\n')
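All four examples rely on a project-specific `Elastic` wrapper whose `scroll_through` method iterates over every hit for a query. The wrapper itself is not shown; the sketch below shows how such a generator could plausibly be built on the official Python client's `elasticsearch.helpers.scan`. The function name and behaviour here are assumptions for illustration, not the project's actual implementation.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

def scroll_through(es: Elasticsearch, index: str, body: dict):
    # Hypothetical stand-in for Elastic.scroll_through: scan() handles the
    # scroll bookkeeping and yields raw hits one by one, each carrying
    # '_id' and '_source', matching how Example #1 reads tweet['_source'].
    # preserve_order=True respects any sort in the query (Example #2 sorts
    # by date), at the cost of a more expensive scroll.
    yield from scan(es, index=index, query=body, preserve_order=True)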
Example #2
    # Method excerpt: relies on self.process from the enclosing class
    def analyze_tweets_subbasin(self, subbasin, languages=None):
        """Collect tweets that are confidently geolocated to exactly one
        detailed location within `subbasin` and hand them to self.process."""
        from db.elastic import Elastic
        es = Elastic()
        query = {
            'query': {
                'term': {
                    'locations.subbasin_ids_9': subbasin
                }
            },
            'sort': {
                'date': 'asc'
            }
        }

        data = []
        tweets = es.scroll_through(index='floods_all',
                                   body=query,
                                   source=False)
        for tweet in tweets:
            detailed_locations = [
                loc for loc in tweet['locations']
                if loc['type'] in ('town', 'adm5', 'adm4', 'adm3', 'landmark')
            ]
            if len(detailed_locations) != 1:
                continue

            detailed_location = detailed_locations[0]
            if subbasin not in detailed_location['subbasin_ids_9']:
                continue

            if detailed_location['score'] < .2:
                continue

            tweet_lang = tweet['source']['lang']
            if languages and tweet_lang not in languages:
                continue

            data.append((subbasin, tweet['id'], tweet['date'], tweet['text'],
                         tweet_lang, None))
        self.process(data, res_file=subbasin, include_context='hydrology')
Example #3
# Excerpt from the detection module: Geoparser, Elastic, PostgreSQL,
# EventDetector, TextClassifier, daterange, isoformat_2_date, Counter,
# ELASTIC_HOST and DOCUMENT_INDEX, as well as mp (multiprocessing), Process,
# c_bool, sys and timedelta, are assumed to be imported elsewhere in the module.
class Detection(Geoparser):
    def __init__(
        self,
        doc_loader,
        n_words,
        classify_tweets,
        minimum_gram_length,
        max_distance_entities_doc,
        doc_score_types,
    ):
        """Get out doc_analyzer, save the minimum score neccesary for docs
        and if the event detection module is turned on, initalize the class
        for that (spinup)"""
        self.n_words = n_words
        self.classify_tweets = classify_tweets
        self.es = Elastic(host=ELASTIC_HOST)
        self.check_toponym_index()
        self.pg = PostgreSQL('gfm')
        super().__init__(self.pg, self.es, doc_score_types,
                         max_distance_entities_doc)
        if self.classify_tweets == 'bert':
            self.text_classifier = TextClassifier()
        self.docs = {}
        doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
        from doc_loader import DocLoaderES
        self.doc_loader = DocLoaderES(*doc_loader_args)

    def check_toponym_index(self):
        if not self.es.indices.exists("locations"):
            print("Toponym index does not exist")
            sys.exit()

    def maybe_set_table_name(self, regions, detection_parameters):
        for i, setting in enumerate(detection_parameters):
            if setting.run_name is None:
                run_name = ("floods"
                            f"_{regions}"
                            f"_{int(setting.location_threshold*100)}"
                            f"_{setting.factor}_{int(setting.fraction*10)}"
                            f"_{setting.base}")
                detection_parameters[i] = setting._replace(run_name=run_name)

    def initial_detection(
        self,
        start,
        end,
    ):
        print("Initial detection")
        for query_start, query_end in daterange(start,
                                                end,
                                                timedelta(days=1),
                                                ranges=True):
            query_end = min(query_end, end)
            print("Initial detection:", query_start, "-", query_end)
            query = self.es.build_date_query(
                query_start,
                query_end,
                locations=True,
            )
            query['query']['bool']['must'].append(
                {'term': {
                    'event_related': True
                }})
            documents = self.es.scroll_through(index=DOCUMENT_INDEX,
                                               body=query,
                                               source=False)
            self.event_detector.detect_events_l(documents,
                                                is_real_time=mp.Value(
                                                    c_bool, False),
                                                convert_to_named_tuple=True)
        print("Finished initial detection")

    def run(
        self,
        start,
        spinup_time,
        timestep_length,
        analysis_length,
        detection_parameters,
        regions,
        real_time,
        max_n_docs_in_memory=None,
        check_previous_docs=True,
        geoparsing_start=False,
        update_locations=True,
        end=False,
        load_detectors=False,
        detection=True,
    ):
        """This program uses 2 processes. The main process (this one) that
        analyzes groups of docs and detects based on this. In addition a
        child process is spawned that reads the docs from the database or
        receives them from a stream. This process is the doc_loader.
        Two events, event_1 and event_2, regulate the execution of both
        processes. First the doc_loader loads the docs used for the spinup
        from the database, then the docs for the first timestep, which are
        all put in a queue (docs_queue). Then this one of the events is
        released, while the doc_loader is paused. The execution of the main
        process is restarted. First it unloads the docs from the docs_queue
        and releases the doc_loader again. This process then iterates."""
        if not update_locations:
            print("WARNING: Not updating locations")

        # The timestep must not be longer than the analysis window
        if timestep_length > analysis_length:
            print("Timestep length exceeds analysis length")
            sys.exit(1)

        # Set parameters for sharing between processes
        # Counter is assumed to be a project-level process-shared counter
        # (it exposes value() and decrease()), not collections.Counter
        n_docs_to_unload = Counter(0)
        timestep_end_str = mp.Array('c', 26)
        docs_queue = mp.Queue()
        event_1 = mp.Event()
        event_2 = mp.Event()
        is_real_time = mp.Value(c_bool, False)

        end_date_spinup = start + spinup_time
        if geoparsing_start:
            if geoparsing_start < start:
                print("ERROR: Geoparsing start is smaller than start date")
                sys.exit()
            geoparsing_start = int((geoparsing_start - start) /
                                   timestep_length) * timestep_length + start
            print("Geoparsing start:", geoparsing_start)
            doc_loader_start = geoparsing_start
        else:
            doc_loader_start = start

        doc_loader_mp = Process(target=self.doc_loader.load_docs,
                                args=(docs_queue, n_docs_to_unload,
                                      doc_loader_start, analysis_length,
                                      timestep_length, event_1, event_2,
                                      timestep_end_str, is_real_time))
        doc_loader_mp.daemon = True
        doc_loader_mp.start()

        if detection and geoparsing_start and geoparsing_start > end_date_spinup:
            self.event_detector = EventDetector(
                self.pg,
                self.es,
                start,
                spinup_time,
                detection_parameters=detection_parameters,
                regions=regions,
                load_detectors=load_detectors,
            )
            self.initial_detection(start, geoparsing_start)
            end_date_spinup = None

        while real_time or not is_real_time.value:
            event_1.wait()
            # Process is assumed to be a subclass that captures exceptions
            # raised in the child and exposes them via `.exception`
            if doc_loader_mp.exception is not None:
                _, traceback = doc_loader_mp.exception
                print(traceback)
                sys.exit()

            unloaded_docs = []
            for i in range(n_docs_to_unload.value()):
                unloaded_docs.append(docs_queue.get())
                n_docs_to_unload.decrease()

            if self.classify_tweets == 'bert':
                about_ongoing_event_docs = []
                about_ongoing_event_doc_ids = set()
                classified_docs = set()

                # Check whether documents are already classified in ES.
                # If so, load the classification from ES.
                if unloaded_docs:
                    documents = self.es.mget(
                        index=DOCUMENT_INDEX,
                        body={'ids': [ID for ID, _ in unloaded_docs]})['docs']
                    # Iterate inside the guard so `documents` is always defined
                    for doc in documents:
                        doc = doc['_source']
                        if 'event_related' in doc:
                            classified_docs.add(doc['id'])
                            if doc['event_related'] is True:
                                about_ongoing_event_doc_ids.add(doc['id'])

                for doc in unloaded_docs:
                    if doc[0] in about_ongoing_event_doc_ids:
                        about_ongoing_event_docs.append(doc)

                docs_to_classify = []
                examples_to_classify = []
                for doc in unloaded_docs:
                    ID, doc_info = doc
                    if ID not in classified_docs:
                        example = {
                            'id': ID,
                            'sentence1': doc_info.clean_text,
                            'label': 0
                        }
                        examples_to_classify.append(example)
                        docs_to_classify.append(doc)

                classes = self.text_classifier(examples_to_classify)

                assert len(classes) == len(docs_to_classify)
                es_update = []
                for doc_class, doc in zip(classes, docs_to_classify):
                    doc_class = (doc_class == 'yes')
                    if doc_class is True:
                        about_ongoing_event_docs.append(doc)
                    es_update.append({
                        'doc': {
                            'event_related': doc_class
                        },
                        '_index': DOCUMENT_INDEX,
                        '_id': doc[0],
                        '_op_type': 'update',
                    })

                self.es.bulk_operation(es_update)

                about_ongoing_event_docs = sorted(about_ongoing_event_docs,
                                                  key=lambda x: x[1].date,
                                                  reverse=False)

                self.docs.update(dict(about_ongoing_event_docs))
            elif self.classify_tweets == 'db':
                # Check whether documents are already classified in ES.
                # If so, load the classification from ES.
                about_ongoing_event_docs = []
                about_ongoing_event_doc_ids = set()
                if unloaded_docs:
                    documents = self.es.mget(
                        index=DOCUMENT_INDEX,
                        body={'ids': [ID for ID, _ in unloaded_docs]})['docs']
                    for doc in documents:
                        doc = doc['_source']
                        if doc['event_related'] is True:
                            about_ongoing_event_doc_ids.add(doc['id'])

                    for doc in unloaded_docs:
                        if doc[0] in about_ongoing_event_doc_ids:
                            about_ongoing_event_docs.append(doc)
                self.docs.update(dict(about_ongoing_event_docs))
            else:
                self.docs.update(dict(unloaded_docs))

            if max_n_docs_in_memory is not None and len(
                    self.docs) > max_n_docs_in_memory:
                n_docs_to_delete = len(self.docs) - max_n_docs_in_memory
                IDs_to_remove = list(self.docs.keys())[:n_docs_to_delete]
                for ID in IDs_to_remove:
                    del self.docs[ID]

            event_1.clear()
            event_2.set()
            near_end_date_spinup = False
            if self.docs:
                timestep_end = str(timestep_end_str.value, 'utf-8')
                timestep_end = isoformat_2_date(timestep_end)
                l_docs = []

                if detection and end_date_spinup and timestep_end >= end_date_spinup:
                    self.event_detector = EventDetector(
                        self.pg,
                        self.es,
                        start,
                        spinup_time,
                        detection_parameters=detection_parameters,
                        load_detectors=load_detectors,
                        regions=regions)
                    self.initial_detection(start,
                                           timestep_end - analysis_length)
                    near_end_date_spinup = True

                for ID, doc in self.docs.items():
                    if doc.date > timestep_end - analysis_length:
                        break
                    else:
                        l_docs.append(ID)

                for i, ID in enumerate(l_docs):
                    l_docs[i] = self.docs[ID]
                    del self.docs[ID]

                self.geoparse_timestep(timestep_end,
                                       update_locations=update_locations)
                if detection and not end_date_spinup and (
                        not geoparsing_start
                        or timestep_end > geoparsing_start + analysis_length):
                    self.event_detector.detect_events_l(
                        l_docs, is_real_time=is_real_time)
                    self.event_detector.detect_events_s(
                        self.docs.values(), is_real_time=is_real_time)
                if near_end_date_spinup:
                    end_date_spinup = None

                if end and timestep_end > end:
                    return None
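The handshake described in run()'s docstring, where the doc_loader fills docs_queue, signals event_1, then waits on event_2 until the main process has drained the queue, can be reduced to the following minimal, self-contained sketch. The custom Counter used above is replaced here by a plain mp.Value, and all names are illustrative rather than taken from the project.

import multiprocessing as mp

def loader(queue, counter, event_1, event_2):
    # Stand-in for doc_loader.load_docs: push a batch of docs per "timestep",
    # then hand control to the main process and wait to be released again.
    for batch in range(3):
        for doc in range(5):
            queue.put((batch, doc))
            with counter.get_lock():
                counter.value += 1
        event_1.set()     # batch is ready: wake the main process
        event_2.wait()    # pause until the main process has drained the queue
        event_2.clear()

def main():
    queue = mp.Queue()
    counter = mp.Value('i', 0)
    event_1, event_2 = mp.Event(), mp.Event()
    p = mp.Process(target=loader, args=(queue, counter, event_1, event_2))
    p.daemon = True
    p.start()
    for _ in range(3):
        event_1.wait()                 # the loader finished a timestep
        while counter.value:
            print(queue.get())         # "unload" one doc
            with counter.get_lock():
                counter.value -= 1
        event_1.clear()
        event_2.set()                  # release the loader again
    p.join()

if __name__ == '__main__':
    main()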
Example #4
def classify():
    # Elastic, Predictor, DOCUMENT_INDEX, remove_field_from_index, clean_text,
    # datetime and the module-level `refresh` flag are assumed to be imported
    # or defined elsewhere in the surrounding module.
    es = Elastic()

    classify_per = 10_000

    if refresh:
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')

    predictor = Predictor()

    query = {
        'query': {
            "bool": {
                "must": [
                    {
                        'exists': {
                            'field': 'locations'
                        }
                    }
                ],
                "must_not": {
                    'exists': {
                        'field': 'event_related'
                    }
                }
            }
        }
    }
    def classify_batch(tweet_subset):
        # Build one example per tweet and bulk-update the `event_related`
        # field in Elasticsearch with the predicted label.
        IDs = []
        examples = []
        for tweet in tweet_subset:
            tweet = tweet['_source']
            IDs.append(tweet['id'])
            examples.append({
                "id": tweet['id'],
                "sentence1": clean_text(tweet['text'], lower=False),
                "label": 0
            })

        labels = predictor(examples)
        es_update = []
        for ID, label in zip(IDs, labels):
            es_update.append({
                'doc': {
                    'event_related': label == 'yes'
                },
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })

        es.bulk_operation(es_update)

    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i/n*100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)

        if len(tweet_subset) == classify_per:
            classify_batch(tweet_subset)
            tweet_subset = []

    # Classify any remaining tweets that did not fill a complete batch
    if tweet_subset:
        classify_batch(tweet_subset)
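The bulk update dicts built in Examples #3 and #4 ('_op_type': 'update' with a partial 'doc') follow the action format expected by the official client's bulk helpers, so es.bulk_operation plausibly wraps elasticsearch.helpers.bulk. The sketch below is an assumption about that wrapper; the name and signature are not taken from the project.

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

def bulk_operation(es: Elasticsearch, actions, chunk_size=500):
    # Hypothetical stand-in for Elastic.bulk_operation: each action carries
    # '_op_type': 'update', so its 'doc' is merged into the stored document.
    success, errors = bulk(es, actions, chunk_size=chunk_size,
                           raise_on_error=False)
    return success, errors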