def once_a_day():
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data from Steam API!')
    games = get_games_db()
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id, game_name)
        try:
            gm = steam_api.get_game(game_id, 'temporal')
            log.info('Steam API: succeeded!')
            gm.update(steam_spy.get_game(game_id, 'temporal'))
            log.info('Steam SPY: succeeded!')
            gm.update(steam_currency.get_game(game_id, 'temporal'))
            log.info('Steam Currency: succeeded!')
            log.info('Starting insertion into Elasticsearch')
            elastic.update(game_id, gm, 'game_tmp')
            log.info('Finished insertion into Elasticsearch')
        except GameNotFound as error:
            log.warning(error)
        except Exception as error:
            log.error(error)
            time.sleep(300)
            # Re-queue the game so it is retried at the end of the run
            games.append(game)
def insert_new_games():
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Inserting new games into Elasticsearch!')
    fail_id = open('ids_fails.txt', 'a')
    known_games = get_games_db()
    all_games = get_all_games()
    games = [game for game in all_games if game not in known_games]
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id, game_name)
        try:
            game_data = steam_api.get_game(game_id, 'estastic')
            log.info('Steam API: succeeded!')
            game_data.update(steam_spy.get_game(game_id, 'estastic'))
            log.info('Steam SPY: succeeded!')
            log.info('Starting insertion into Elasticsearch')
            elastic.update(game_id, game_data, 'game_est')
            log.info('Finished insertion into Elasticsearch')
        except GameNotFound as error:
            log.warning(error)
        except Exception as error:
            log.error(error)
            time.sleep(300)
            # Record the failed ID so try_fails_id() can retry it later
            fail_id.write(f'{game_id} || {game_name}\n')
    fail_id.close()
class Fill:

    def __init__(self):
        self.keywords = self.set_keywords()
        self.es = Elastic()

    def set_keywords(self):
        df = pd.read_excel('input/twitter_supported_languages.xlsx')
        df = df[df['implemented'] == True].set_index(
            'language_code')['floods_filtered']
        keywords = {}
        for language, words in df.items():
            keywords[language] = set(
                word.strip().lower() for word in words.split(','))
        return keywords

    def generate_tweets(self, fp, start=datetime(1970, 1, 1)):
        with open(fp, 'rb') as f:
            for tweet in f:
                tweet = tweet.strip()
                try:
                    tweet = json.loads(tweet)
                except json.decoder.JSONDecodeError:
                    continue
                try:
                    language = tweet['lang']
                except KeyError:
                    continue
                clean_text = sanitize.clean_text(tweet['text'], lower=False)
                clean_text_lower = clean_text.lower()
                try:
                    if not any(keyword in clean_text_lower
                               for keyword in self.keywords[language]):
                        continue
                except KeyError:
                    continue
                yield tweet

    def prepare_doc(self, json_doc):
        doc2es = tweet_parser(json_doc)
        doc2es['_index'] = DOCUMENT_INDEX
        doc2es['_id'] = doc2es['id']
        doc2es['_type'] = '_doc'
        doc2es['source']['type'] = 'tweet'
        return doc2es

    def prepare_docs(self, docs):
        for doc in docs:
            doc2es = self.prepare_doc(doc)
            if doc2es:
                yield doc2es

    def commit_docs(self, docs):
        self.es.bulk_operation(docs)

    def __call__(self, fp):
        tweets = self.generate_tweets(fp)
        tweets = self.prepare_docs(tweets)
        self.commit_docs(tweets)
def gzip_to_es(move_per=10000):
    es = Elastic()

    def get_labels():
        with gzip.open('tweets_labelled.gz', 'rt', encoding='utf-8') as f:
            for line in f:
                ID, label = line.strip().split('\t')
                yield ID, label

    def move_to_db(labels):
        es_update = []
        for ID, label in labels:
            es_update.append({
                'doc': {
                    'event_related': True if label == 'yes' else False
                },
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })
        es.bulk_operation(es_update)

    for i, labels in enumerate(chunker(get_labels(), move_per)):
        print(i)
        move_to_db(labels)
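# The chunker() helper used by gzip_to_es() above is not shown in this file;
# a minimal sketch of what it is assumed to do (yield lists of `size` items
# from an iterable) could look like the following -- an illustration, not the
# project's own implementation.
from itertools import islice


def chunker(iterable, size):
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, size))
        if not chunk:
            return
        yield chunk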
def export():
    es = Elastic()
    query = {}
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    n = 1
    with gzip.open('tweets.gz', 'wt', encoding='utf-8') as f:
        for tweet in tweets:
            if not n % 1000:
                print(f"{n} - {datetime.now()}")
            tweet = tweet['_source']
            if 'locations' in tweet:
                n += 1
                ID = tweet['id']
                text = clean_text(tweet['text'], lower=False)
                f.write(f'{ID}\t{text}\n')
def once_a_week():
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data once a week!')
    games = get_games_db()
    for game in games:
        log.info('Starting the extraction of game: %s - %s', game[0], game[1])
        try:
            gm = youtube_api.get_game(str(game[1]), 'temporal')
            log.info('Youtube API: succeeded!')
            log.info('Starting update in Elasticsearch')
            elastic.update(int(game[0]), gm, 'game_tmp')
            log.info('Finished update in Elasticsearch')
        except GameNotFound as error:
            log.warning(error)
        except Exception as error:
            log.error(error)
            time.sleep(3600)
            # Re-queue the game so it is retried at the end of the run
            games.append(game)
def analyze_tweets_subbasin(self, subbasin, languages=None):
    from db.elastic import Elastic
    es = Elastic()
    query = {
        'query': {
            'term': {
                'locations.subbasin_ids_9': subbasin
            }
        },
        'sort': {
            'date': 'asc'
        }
    }
    data = []
    tweets = es.scroll_through(index='floods_all', body=query, source=False)
    for tweet in tweets:
        detailed_locations = [
            loc for loc in tweet['locations']
            if loc['type'] in ('town', 'adm5', 'adm4', 'adm3', 'landmark')
        ]
        if len(detailed_locations) != 1:
            continue
        detailed_location = detailed_locations[0]
        if subbasin not in detailed_location['subbasin_ids_9']:
            continue
        if detailed_location['score'] < .2:
            continue
        tweet_lang = tweet['source']['lang']
        if languages and tweet_lang not in languages:
            continue
        data.append((subbasin, tweet['id'], tweet['date'], tweet['text'],
                     tweet_lang, None))
    self.process(data, res_file=subbasin, include_context='hydrology')
def try_fails_id():
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Trying to insert the failed IDs again!')
    with open('ids_fails.txt', 'r') as games:
        for game in games:
            game_id, game_name = game.strip().split(' || ')
            game_id = int(game_id)
            log.info('Starting the extraction of game: %s - %s', game_id,
                     game_name)
            try:
                game_data = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: succeeded!')
                game_data.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: succeeded!')
                log.info('Starting insertion into Elasticsearch')
                elastic.update(game_id, game_data, 'game_est')
                log.info('Finished insertion into Elasticsearch')
            except GameNotFound as error:
                log.warning(error)
            except Exception as error:
                log.error(error)
                time.sleep(300)
    os.remove('ids_fails.txt')
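# One possible way to wire the jobs above (once_a_day, once_a_week,
# insert_new_games, try_fails_id) into a long-running worker, assuming the
# third-party `schedule` package; the cadences and times below are
# illustrative, not taken from the original project.
import time

import schedule

schedule.every().day.at('03:00').do(once_a_day)
schedule.every().monday.at('04:00').do(once_a_week)
schedule.every().day.at('05:00').do(insert_new_games)
schedule.every().day.at('06:00').do(try_fails_id)

while True:
    schedule.run_pending()
    time.sleep(60)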
class Fill:

    def __init__(self):
        self.keywords = self.set_keywords()
        self.es = Elastic()

    def set_keywords(self):
        df = pd.read_excel('input/twitter_supported_languages.xlsx')
        df = df[df['implemented'] == True].set_index(
            'language_code')['floods_filtered']
        keywords = {}
        for language, words in df.items():
            keywords[language] = set(
                word.strip().lower() for word in words.split(','))
        return keywords

    def open(self, fp):
        if fp.endswith('.gzip') or fp.endswith('.gz'):
            with gzip.open(fp, 'r') as gz:
                for tweet in gz:
                    yield tweet.decode('utf-8')
        elif fp.endswith('.jsonl'):
            with open(fp, 'rb') as f:
                for line in f:
                    if line.startswith(b'#'):
                        continue
                    yield line.strip()
        else:
            raise NotImplementedError(
                f'reader for extension {fp.split(".")[-1]} not implemented')

    def generate_tweets(self, fp, start=datetime(1970, 1, 1)):
        for tweet in self.open(fp):
            try:
                tweet = json.loads(tweet)
            except json.decoder.JSONDecodeError:
                continue
            yield tweet

    def prepare_doc(self, json_doc):
        if 'limit' in json_doc:
            return None
        doc2es = tweet_parser(json_doc)
        if not doc2es:
            return None
        language = doc2es['source']['lang']
        clean_text = sanitize.clean_text(doc2es['text'], lower=False)
        clean_text_lower = clean_text.lower()
        try:
            if not any(keyword in clean_text_lower
                       for keyword in self.keywords[language]):
                return None
        except KeyError:
            return None
        doc2es['_index'] = DOCUMENT_INDEX
        doc2es['_id'] = doc2es['id']
        doc2es['source']['type'] = 'tweet'
        return doc2es

    def prepare_docs(self, docs):
        for doc in docs:
            doc2es = self.prepare_doc(doc)
            if doc2es:
                yield doc2es

    def commit_docs(self, docs):
        self.es.bulk_operation(docs)

    def __call__(self, fp):
        tweets = self.generate_tweets(fp)
        tweets = self.prepare_docs(tweets)
        self.commit_docs(tweets)
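# Minimal usage sketch for the Fill class above; the archive path is
# illustrative and assumes a newline-delimited JSON dump of tweets.
if __name__ == '__main__':
    fill = Fill()
    fill('input/tweets.jsonl')  # .gz / .gzip archives are also accepted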
class Detection(Geoparser):

    def __init__(
        self,
        doc_loader,
        n_words,
        classify_tweets,
        minimum_gram_length,
        max_distance_entities_doc,
        doc_score_types,
    ):
        """Set up the doc_analyzer, save the minimum score necessary for docs
        and, if the event detection module is turned on, initialize the class
        for that (spinup)."""
        self.n_words = n_words
        self.classify_tweets = classify_tweets
        self.es = Elastic(host=ELASTIC_HOST)
        self.check_toponym_index()
        self.pg = PostgreSQL('gfm')
        super().__init__(self.pg, self.es, doc_score_types,
                         max_distance_entities_doc)
        if self.classify_tweets == 'bert':
            self.text_classifier = TextClassifier()
        self.docs = {}
        doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
        from doc_loader import DocLoaderES
        self.doc_loader = DocLoaderES(*doc_loader_args)

    def check_toponym_index(self):
        if not self.es.indices.exists("locations"):
            print("Toponym index does not exist")
            sys.exit()

    def maybe_set_table_name(self, regions, detection_parameters):
        for i, setting in enumerate(detection_parameters):
            if setting.run_name is None:
                run_name = ("floods"
                            f"_{regions}"
                            f"_{int(setting.location_threshold*100)}"
                            f"_{setting.factor}_{int(setting.fraction*10)}"
                            f"_{setting.base}")
                detection_parameters[i] = setting._replace(run_name=run_name)

    def initial_detection(
        self,
        start,
        end,
    ):
        print("Initial detection")
        for query_start, query_end in daterange(start,
                                                end,
                                                timedelta(days=1),
                                                ranges=True):
            query_end = min(query_end, end)
            print("Initial detection:", query_start, "-", query_end)
            query = self.es.build_date_query(
                query_start,
                query_end,
                locations=True,
            )
            query['query']['bool']['must'].append(
                {'term': {
                    'event_related': True
                }})
            documents = self.es.scroll_through(index=DOCUMENT_INDEX,
                                               body=query,
                                               source=False)
            self.event_detector.detect_events_l(documents,
                                                is_real_time=mp.Value(
                                                    c_bool, False),
                                                convert_to_named_tuple=True)
        print("Finished initial detection")

    def run(
        self,
        start,
        spinup_time,
        timestep_length,
        analysis_length,
        detection_parameters,
        regions,
        real_time,
        max_n_docs_in_memory=None,
        check_previous_docs=True,
        geoparsing_start=False,
        update_locations=True,
        end=False,
        load_detectors=False,
        detection=True,
    ):
        """This program uses two processes. The main process (this one)
        analyzes groups of docs and runs detection on them. In addition, a
        child process, the doc_loader, is spawned that reads the docs from
        the database or receives them from a stream. Two events, event_1 and
        event_2, regulate the execution of both processes. First the
        doc_loader loads the docs used for the spinup from the database, then
        the docs for the first timestep; all of these are put in a queue
        (docs_queue). Then one of the events is released while the doc_loader
        is paused, and execution of the main process resumes: it unloads the
        docs from the docs_queue and releases the doc_loader again. This
        cycle then repeats. (A simplified standalone sketch of this handshake
        follows the class definition.)"""
        if not update_locations:
            print("WARNING: Not updating locations")

        # Check that the timestep is not bigger than the analysis length
        if timestep_length > analysis_length:
            print("Timestep too big")
            sys.exit(0)

        # Set parameters for sharing between processes
        n_docs_to_unload = Counter(0)
        timestep_end_str = mp.Array('c', 26)
        docs_queue = mp.Queue()
        event_1 = mp.Event()
        event_2 = mp.Event()
        is_real_time = mp.Value(c_bool, False)

        end_date_spinup = start + spinup_time
        if geoparsing_start:
            if geoparsing_start < start:
                print("ERROR: Geoparsing start is smaller than start date")
                sys.exit()
            geoparsing_start = int(
                (geoparsing_start - start) /
                timestep_length) * timestep_length + start
            print("Geoparsing start:", geoparsing_start)
            doc_loader_start = geoparsing_start
        else:
            doc_loader_start = start

        doc_loader_mp = Process(target=self.doc_loader.load_docs,
                                args=(docs_queue, n_docs_to_unload,
                                      doc_loader_start, analysis_length,
                                      timestep_length, event_1, event_2,
                                      timestep_end_str, is_real_time))
        doc_loader_mp.daemon = True
        doc_loader_mp.start()

        if detection and geoparsing_start and geoparsing_start > end_date_spinup:
            self.event_detector = EventDetector(
                self.pg,
                self.es,
                start,
                spinup_time,
                detection_parameters=detection_parameters,
                regions=regions,
                load_detectors=load_detectors,
            )
            self.initial_detection(start, geoparsing_start)
            end_date_spinup = None

        while real_time or not is_real_time.value:
            event_1.wait()
            if doc_loader_mp.exception is not None:
                _, traceback = doc_loader_mp.exception
                print(traceback)
                sys.exit()

            unloaded_docs = []
            for i in range(n_docs_to_unload.value()):
                unloaded_docs.append(docs_queue.get())
                n_docs_to_unload.decrease()

            if self.classify_tweets == 'bert':
                about_ongoing_event_docs = []
                about_ongoing_event_doc_ids = set()
                classified_docs = set()
                # Check whether documents are already classified in ES.
                # If so, load the classification from ES.
                if unloaded_docs:
                    documents = self.es.mget(
                        index=DOCUMENT_INDEX,
                        body={'ids': [ID for ID, _ in unloaded_docs]})['docs']
                    for doc in documents:
                        doc = doc['_source']
                        if 'event_related' in doc:
                            classified_docs.add(doc['id'])
                            if doc['event_related'] is True:
                                about_ongoing_event_doc_ids.add(doc['id'])
                    for doc in unloaded_docs:
                        if doc[0] in about_ongoing_event_doc_ids:
                            about_ongoing_event_docs.append(doc)

                docs_to_classify = []
                examples_to_classify = []
                for doc in unloaded_docs:
                    ID, doc_info = doc
                    if ID not in classified_docs:
                        example = {
                            'id': ID,
                            'sentence1': doc_info.clean_text,
                            'label': 0
                        }
                        examples_to_classify.append(example)
                        docs_to_classify.append(doc)
                classes = self.text_classifier(examples_to_classify)
                assert len(classes) == len(docs_to_classify)

                es_update = []
                for doc_class, doc in zip(classes, docs_to_classify):
                    doc_class = True if doc_class == 'yes' else False
                    if doc_class is True:
                        about_ongoing_event_docs.append(doc)
                    es_update.append({
                        'doc': {
                            'event_related': doc_class
                        },
                        '_index': DOCUMENT_INDEX,
                        '_id': doc[0],
                        '_op_type': 'update',
                    })
                self.es.bulk_operation(es_update)

                about_ongoing_event_docs = sorted(about_ongoing_event_docs,
                                                  key=lambda x: x[1].date,
                                                  reverse=False)
                self.docs.update(dict(about_ongoing_event_docs))
            elif self.classify_tweets == 'db':
                # Check whether documents are already classified in ES.
                # If so, load the classification from ES.
                about_ongoing_event_docs = []
                # Initialize the ID set (missing in the original, which
                # raised a NameError in this branch)
                about_ongoing_event_doc_ids = set()
                if unloaded_docs:
                    documents = self.es.mget(
                        index=DOCUMENT_INDEX,
                        body={'ids': [ID for ID, _ in unloaded_docs]})['docs']
                    for doc in documents:
                        doc = doc['_source']
                        if doc['event_related'] is True:
                            about_ongoing_event_doc_ids.add(doc['id'])
                    for doc in unloaded_docs:
                        if doc[0] in about_ongoing_event_doc_ids:
                            about_ongoing_event_docs.append(doc)
                self.docs.update(dict(about_ongoing_event_docs))
            else:
                self.docs.update(dict(unloaded_docs))

            if max_n_docs_in_memory is not None and len(
                    self.docs) > max_n_docs_in_memory:
                n_docs_to_delete = len(self.docs) - max_n_docs_in_memory
                IDs_to_remove = list(self.docs.keys())[:n_docs_to_delete]
                for ID in IDs_to_remove:
                    del self.docs[ID]

            event_1.clear()
            event_2.set()

            near_end_date_spinup = False
            if self.docs:
                timestep_end = str(timestep_end_str.value, 'utf-8')
                timestep_end = isoformat_2_date(timestep_end)
                l_docs = []
                if detection and end_date_spinup and timestep_end >= end_date_spinup:
                    self.event_detector = EventDetector(
                        self.pg,
                        self.es,
                        start,
                        spinup_time,
                        detection_parameters=detection_parameters,
                        load_detectors=load_detectors,
                        regions=regions)
                    self.initial_detection(start,
                                           timestep_end - analysis_length)
                    near_end_date_spinup = True
                for ID, doc in self.docs.items():
                    if doc.date > timestep_end - analysis_length:
                        break
                    else:
                        l_docs.append(ID)
                for i, ID in enumerate(l_docs):
                    l_docs[i] = self.docs[ID]
                    del self.docs[ID]
                self.geoparse_timestep(timestep_end,
                                       update_locations=update_locations)
                if detection and not end_date_spinup and (
                        not geoparsing_start
                        or timestep_end > geoparsing_start + analysis_length):
                    self.event_detector.detect_events_l(
                        l_docs, is_real_time=is_real_time)
                    self.event_detector.detect_events_s(
                        self.docs.values(), is_real_time=is_real_time)
                if near_end_date_spinup:
                    end_date_spinup = None
                if end and timestep_end > end:
                    return None
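# Standalone sketch of the two-process handshake described in run()'s
# docstring: a loader process fills docs_queue and signals event_1, the main
# process drains the queue and signals event_2 so the loader can continue.
# The names mirror the class above, but this is a simplified illustration,
# not the project's code.
import multiprocessing as mp


def loader(queue, n_per_timestep, event_1, event_2):
    for timestep in range(3):
        for doc in range(n_per_timestep):
            queue.put((timestep, doc))
        event_2.clear()
        event_1.set()    # a timestep's worth of docs is ready
        event_2.wait()   # pause until the main process has drained them


if __name__ == '__main__':
    queue, event_1, event_2 = mp.Queue(), mp.Event(), mp.Event()
    loader_process = mp.Process(target=loader,
                                args=(queue, 5, event_1, event_2),
                                daemon=True)
    loader_process.start()
    for _ in range(3):
        event_1.wait()
        for _ in range(5):
            print(queue.get())
        event_1.clear()
        event_2.set()    # release the loader for the next timestep
    loader_process.join()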
def load_docs(self, docs_queue, n_docs_to_unload, start, analysis_length,
              timestep_length, event_1, event_2, timestep_end_str,
              is_real_time, datetime=datetime):
    try:
        es = Elastic(host=ELASTIC_HOST)
        pg = PostgreSQL('gfm')
        doc_analyzer = DocAnalyzer(es, pg, self.doc_score_types, self.n_words,
                                   self.minimum_gram_length)
        spinup_start = start - analysis_length + timestep_length
        self.load_timestep_es(es, doc_analyzer, docs_queue, n_docs_to_unload,
                              spinup_start, start)
        timestep = 1
        timestep_end = start + timestep * timestep_length
        while timestep_end < datetime.utcnow():
            query_start = timestep_end - timestep_length
            self.load_timestep_es(es, doc_analyzer, docs_queue,
                                  n_docs_to_unload, query_start, timestep_end)
            timestep_end_str.value = self.encode_dt(timestep_end)
            timestep += 1
            timestep_end = start + timestep * timestep_length
            event_2.clear()
            event_1.set()
            event_2.wait()
        last_timestep_end = timestep_end - timestep_length
        is_real_time.value = True
        while True:
            timestep_end = datetime.utcnow()
            sleep = (timedelta(minutes=3) -
                     (timestep_end - last_timestep_end)).total_seconds()
            if sleep > 0:
                time.sleep(sleep)
                timestep_end = datetime.utcnow()
            self.load_timestep_es(es, doc_analyzer, docs_queue,
                                  n_docs_to_unload, last_timestep_end,
                                  timestep_end)
            last_timestep_end = timestep_end
            timestep_end_str.value = self.encode_dt(timestep_end)
            event_2.clear()
            event_1.set()
            event_2.wait()
    except Exception as e:
        # The parent process inspects failures via doc_loader_mp.exception,
        # so simply re-raise here.
        raise
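# encode_dt() is not shown in this file; given that load_docs() writes its
# result into mp.Array('c', 26) and Detection.run() decodes it with
# str(timestep_end_str.value, 'utf-8') followed by isoformat_2_date(), a
# plausible sketch (an assumption, not the original implementation) is:
def encode_dt(self, dt):
    # 'YYYY-MM-DDTHH:MM:SS.ffffff' is exactly 26 bytes
    return dt.isoformat().encode('utf-8')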
}, "publishers": { "type": "keyword", "store": "true" }, "platforms": { "type": "keyword", "store": "true" }, } }, } } try: elastic = Elastic('elastic:9200', 'steam_est') log.info('Elasticsearch connected') log.info('Creating index Steam Estastic on Elasticsearch') elastic.create_index(index_body) log.info('Index Steam Created') games = get_all_games() log.debug(len(games)) for game in games: game_id, game_name = int(game[0]), str(game[1]) log.info('Starting the extraction of game: %s - %s', game_id, game_name) try: game = steam_api.get_game(game_id, 'estastic') log.info('Steam API: successed!') game.update(steam_spy.get_game(game_id, 'estastic')) log.info('Steam SPY: successed!')
from db.elastic import Elastic
from config import DOC_SCORE_TYPES, DOCUMENT_INDEX

es = Elastic()
es.maybe_create_document_index(DOCUMENT_INDEX, DOC_SCORE_TYPES)
def classify(refresh=False):
    # refresh: clear the 'event_related' field first so everything is
    # re-classified (in the original this was an undefined module-level flag)
    es = Elastic()
    classify_per = 10_000
    if refresh:
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')
    predictor = Predictor()
    query = {
        'query': {
            "bool": {
                "must": [{
                    'exists': {
                        'field': 'locations'
                    }
                }],
                "must_not": {
                    'exists': {
                        'field': 'event_related'
                    }
                }
            }
        }
    }
    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)

    def classify_batch(tweet_subset):
        IDs = []
        examples = []
        for tweet in tweet_subset:
            tweet = tweet['_source']
            IDs.append(tweet['id'])
            examples.append({
                "id": tweet['id'],
                "sentence1": clean_text(tweet['text'], lower=False),
                "label": 0
            })
        labels = predictor(examples)
        es_update = []
        for ID, label in zip(IDs, labels):
            es_update.append({
                'doc': {
                    'event_related': True if label == 'yes' else False
                },
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })
        es.bulk_operation(es_update)

    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i / n * 100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)
        if len(tweet_subset) == classify_per:
            classify_batch(tweet_subset)
            tweet_subset = []
    if tweet_subset:
        # Flush the final partial batch (never classified in the original)
        classify_batch(tweet_subset)
from db.elastic import Elastic
import sys

es = Elastic()


def remove_field_from_index(index, field):
    body = {
        "query": {
            "bool": {
                "must": [{
                    "exists": {
                        "field": field
                    }
                }]
            }
        }
    }
    print(f"removing field '{field}' from "
          f"{es.n_hits(index=index, body=body)} documents in index '{index}'")
    body.update({
        "script": {
            "inline": f'ctx._source.remove("{field}")'
        }
    })
    es.update_by_query(index=index, body=body, conflicts='proceed')


if __name__ == '__main__':
    remove_field_from_index(sys.argv[-2], sys.argv[-1])
# Name of the PostgreSQL database (lowercase)
POSTGRESQL_DB = 'taggs'
# Name of the toponym resolution table
TOPONYM_RESOLUTION_TABLE = 'toponym_resolution_table'
# Refresh time of the real-time geotagging module
REAL_TIME_TAGGER_REFRESH_TIME = 300  # sec
# Name of the Elasticsearch index with tweets
TWEETS_INDEX = 'taggs'
# Name of the Elasticsearch index with toponyms
TOPONYM_INDEX = 'toponyms'
# Update tweets in the database with their locations (flag for testing purposes)
UPDATE = False

# Connect to the databases
es_tweets = Elastic()
es_toponyms = es_tweets
pg_Geotag = PostgreSQL(POSTGRESQL_DB)
pg = PostgreSQL(POSTGRESQL_DB)


# The functions below are meant to connect to your database.
class TweetAnalyzerCustom:
    # ID = ID of the tweet as str
    # tweet = {
    #     'date': '%a %b %d %H:%M:%S +0000 %Y',
    #     'user': {
    #         'id': user ID,
    #         'location': user location,
    #         'time zone': user time zone,
    #     },