def _spinup_detectors(self, detectors, start, spinup_time):
    """Feed historical documents to the detectors so they start with state."""
    print("spinup detectors")
    # Walk through the spinup window one day at a time.
    for query_start, query_end in daterange(start,
                                            start + spinup_time,
                                            timedelta(hours=24),
                                            ranges=True):
        query = self.es.build_date_query(
            query_start,
            min(query_end, start + spinup_time),
            locations=True,
        )
        query['query']['bool']['must'].append(
            {'term': {
                'event_related': True
            }})
        print(f'{query_start}:',
              self.es.n_hits(index=DOCUMENT_INDEX, body=query), 'docs')
        docs = self.es.scroll_through(index=DOCUMENT_INDEX,
                                      body=query,
                                      source=False)
        for doc in docs:
            doc = self.doc_to_namedtuple(doc)
            self.maybe_send_doc_to_detector(doc, detectors, 'spinup')
    # With the spinup data loaded, initialize every detector.
    for detectors_per_setting in detectors.values():
        for detector in detectors_per_setting.values():
            detector.initialize()
    return detectors
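# The functions in this module rely on a project-local ``daterange`` helper
# that is not reproduced here. The sketch below is an illustrative assumption
# of its behaviour, inferred from the call sites (``ranges=True`` yields
# consecutive (start, end) windows, ``include_last=False`` drops the final
# point); the real implementation may differ, e.g. in whether it clamps the
# last window to ``stop``.
def _daterange_sketch(start, stop, step, ranges=False, include_last=True):
    if ranges:
        # Yield consecutive (window_start, window_end) pairs; callers such as
        # _spinup_detectors clamp the last window end themselves via min().
        window_start = start
        while window_start < stop:
            yield window_start, window_start + step
            window_start += step
    else:
        # Yield the individual points from start up to (optionally) stop.
        current = start
        while current < stop:
            yield current
            current += step
        if include_last:
            yield stop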
def calculate_lim(self,
                  from_dt,
                  to_dt,
                  x=30,
                  inf=np.inf,
                  array=np.array,
                  percentile=np.percentile):
    """Calculate a limit on the time gap (in seconds) between documents.

    The limit is the normal gap between documents over the period, scaled
    by ``self.fraction``; it is infinite when no document counts are
    available.
    """
    assert isinstance(from_dt, date)
    assert isinstance(to_dt, date)
    if self.n_docs_per_day:
        values = [
            self.n_docs_per_day[day]
            for day in daterange(from_dt, to_dt, timedelta(days=1))
        ]
        if values:
            values = array(values)
            # Use the mean of the x-th and (100 - x)-th percentiles as a
            # robust estimate of the normal number of documents per day.
            self.norm_n_docs_per_day = (percentile(values, x) +
                                        percentile(values, 100 - x)) / 2
            if self.norm_n_docs_per_day == 0:
                limit = inf
            else:
                normal_gap_between_docs = 24 * 3600 / self.norm_n_docs_per_day
                limit = normal_gap_between_docs * self.fraction
        else:
            self.norm_n_docs_per_day = 0
            limit = inf
    else:
        self.norm_n_docs_per_day = 0
        limit = inf
    assert limit >= 0
    return limit
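# Worked example of the formula in ``calculate_lim`` (the values are
# illustrative, not taken from the source): with daily counts [40, 50, 60]
# and x = 30, the 30th and 70th percentiles are 46 and 54, so
# norm_n_docs_per_day = (46 + 54) / 2 = 50 and the normal gap between
# documents is 24 * 3600 / 50 = 1728 seconds. With a hypothetical
# self.fraction of 2, the returned limit would be 3456 seconds.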
def initial_detection(self, start, end):
    print("Initial detection")
    for query_start, query_end in daterange(start,
                                            end,
                                            timedelta(days=1),
                                            ranges=True):
        query_end = min(query_end, end)
        print("Initial detection:", query_start, "-", query_end)
        query = self.es.build_date_query(
            query_start,
            query_end,
            locations=True,
        )
        query['query']['bool']['must'].append(
            {'term': {
                'event_related': True
            }})
        documents = self.es.scroll_through(index=DOCUMENT_INDEX,
                                           body=query,
                                           source=False)
        self.event_detector.detect_events_l(documents,
                                            is_real_time=mp.Value(
                                                c_bool, False),
                                            convert_to_named_tuple=True)
    print("Finished initial detection")
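# The body returned by ``build_date_query`` is not shown in this module.
# Judging from the way it is mutated above (``query['query']['bool']['must']``)
# and from the fields used elsewhere in this file, it plausibly looks like the
# sketch below; the exact filters produced for the date range and for
# ``locations=True`` are assumptions.
_example_query_sketch = {
    'query': {
        'bool': {
            'must': [
                {'range': {'date': {'gte': '2014-07-29T00:00:00',
                                    'lt': '2014-07-30T00:00:00'}}},
                {'exists': {'field': 'locations'}},
                # Appended by _spinup_detectors / initial_detection:
                {'term': {'event_related': True}},
            ]
        }
    }
}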
def sample_per_day_per_adm_languages(languages, max_count=1):
    start_query = datetime(2014, 7, 29)
    end_query = datetime(2018, 11, 20)
    days = list(
        daterange(start_query, end_query, timedelta(days=1),
                  include_last=False))
    for language in languages:
        print(language)
        query = {
            'query': {
                'bool': {
                    'must': [{
                        'term': {
                            'source.lang': language
                        }
                    }, {
                        'exists': {
                            'field': 'text'
                        }
                    }, {
                        'term': {
                            'source.retweet': False
                        }
                    }]
                }
            }
        }
        print(es.n_hits(index=index, body=query))
        while True:
            pg.cur.execute(
                """
                SELECT COUNT(*)
                FROM classification
                WHERE language_code = %s
                """, (language, ))
            count = pg.cur.fetchone()[0]
            print(count, '\r')
            if count >= max_count:
                break
            # pick random day
            day = choice(days)
            all_adm = []
            print(day)
            for level in ('level_0', 'level_1'):
                query = {
                    "size": 0,
                    "query": {
                        "bool": {
                            "must": [{
                                "term": {
                                    "source.lang": language
                                }
                            }, {
                                "range": {
                                    "date": {
                                        "gte": day.isoformat(),
                                        "lt": (day +
                                               timedelta(days=1)).isoformat()
                                    }
                                }
                            }, {
                                "term": {
                                    "source.retweet": False
                                }
                            }]
                        }
                    },
                    "aggs": {
                        'adm': {
                            "terms": {
                                "field": f"locations.{level}_region",
                                "size": 500_000
                            }
                        }
                    }
                }
                res = es.search(index=index, body=query)['aggregations']['adm']
                assert res['doc_count_error_upper_bound'] == 0
                all_adm.extend([(bucket['key'], level)
                                for bucket in res['buckets']])
            print(all_adm)
            # check if list not empty
            if all_adm:
                adm, level = choice(all_adm)
                tweet = get_tweet(day, day + timedelta(days=1), adm, level,
                                  language)
                print(day, all_adm)
                if tweet:
                    tweet_id, text, date = tweet
                    pg.cur.execute(
                        """
                        INSERT INTO classification (id, txt, date, language_code)
                        VALUES (%s, %s, %s, %s)
                        ON CONFLICT DO NOTHING
                        """, (tweet_id, text, date, language))
                    pg.conn.commit()
    pg.conn.commit()
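# The ``classification`` table used above is created elsewhere. Judging from
# the INSERT and SELECT statements it needs at least the four columns below,
# and ``ON CONFLICT DO NOTHING`` implies a unique constraint, most likely on
# ``id``. The column types in this sketch are assumptions, not taken from the
# source.
def _create_classification_table_sketch(pg):
    pg.cur.execute("""
        CREATE TABLE IF NOT EXISTS classification (
            id TEXT PRIMARY KEY,
            txt TEXT,
            date TIMESTAMP,
            language_code TEXT
        )
    """)
    pg.conn.commit()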