Example #1
 def __init__(
     self,
     doc_loader,
     n_words,
     classify_tweets,
     minimum_gram_length,
     max_distance_entities_doc,
     doc_score_types,
 ):
     """Get out doc_analyzer, save the minimum score neccesary for docs
     and if the event detection module is turned on, initalize the class
     for that (spinup)"""
     self.n_words = n_words
     self.classify_tweets = classify_tweets
     self.es = Elastic(host=ELASTIC_HOST)
     self.check_toponym_index()
     self.pg = PostgreSQL('gfm')
     super().__init__(self.pg, self.es, doc_score_types,
                      max_distance_entities_doc)
     if self.classify_tweets == 'bert':
         self.text_classifier = TextClassifier()
     self.docs = {}
     doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
     from doc_loader import DocLoaderES
     self.doc_loader = DocLoaderES(*doc_loader_args)
Example #2
def once_a_day():
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data from Steam API!')
    games = get_games_db()
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id,
                 game_name)
        try:
            gm = steam_api.get_game(game_id, 'temporal')
            log.info('Steam API: succeeded!')
            gm.update(steam_spy.get_game(game_id, 'temporal'))
            log.info('Steam SPY: succeeded!')
            gm.update(steam_currency.get_game(game_id, 'temporal'))
            log.info('Steam Currency: succeeded!')
            log.info('Starting insertion into Elasticsearch')
            elastic.update(game_id, gm, 'game_tmp')
            log.info('Finished insertion into Elasticsearch')
        except Exception as error:
            if isinstance(error, GameNotFound):
                log.warning(error)
            else:
                log.error(error)
            time.sleep(300)
            # re-queue the failed game so it is retried later in this run
            games.append(game)
Example #3
def insert_new_games():
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Inserting new games into Elasticsearch!')
    lst1 = get_games_db()
    lst2 = get_all_games()
    games = [game for game in lst2 if game not in lst1]
    # keep the ids of games that failed so they can be retried later
    with open("ids_fails.txt", "a") as fail_id:
        for game in games:
            game_id, game_name = int(game[0]), str(game[1])
            log.info('Starting the extraction of game: %s - %s', game_id,
                     game_name)
            try:
                game = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: succeeded!')
                game.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: succeeded!')
                log.info('Starting insertion into Elasticsearch')
                elastic.update(game_id, game, 'game_est')
                log.info('Finished insertion into Elasticsearch')
            except Exception as error:
                if isinstance(error, GameNotFound):
                    log.warning(error)
                else:
                    log.error(error)
                time.sleep(300)
                fail_id.write(str(game_id) + " || " + str(game_name) + "\n")
Example #4
class Fill:
    def __init__(self):
        self.keywords = self.set_keywords()
        self.es = Elastic()

    def set_keywords(self):
        df = pd.read_excel('input/twitter_supported_languages.xlsx')
        df = df[df['implemented'] == True].set_index(
            'language_code')['floods_filtered']
        keywords = {}
        # Series.iteritems() was removed in pandas 2.0; .items() is equivalent
        for language, words in df.items():
            keywords[language] = set(
                [word.strip().lower() for word in words.split(',')])
        return keywords

    def generate_tweets(self, fp, start=datetime(1970, 1, 1)):
        with open(fp, 'rb') as f:
            for tweet in f.readlines():
                tweet = tweet.strip()
                try:
                    tweet = json.loads(tweet)
                except json.decoder.JSONDecodeError:
                    continue
                try:
                    language = tweet['lang']
                except KeyError:
                    continue
                clean_text = sanitize.clean_text(tweet['text'], lower=False)
                clean_text_lower = clean_text.lower()
                try:
                    if not any(keyword in clean_text_lower
                               for keyword in self.keywords[language]):
                        continue
                except KeyError:
                    continue
                yield tweet

    def prepare_doc(self, json_doc):
        doc2es = tweet_parser(json_doc)
        doc2es['_index'] = DOCUMENT_INDEX
        doc2es['_id'] = doc2es['id']
        doc2es['_type'] = '_doc'
        doc2es['source']['type'] = 'tweet'
        return doc2es

    def prepare_docs(self, docs):
        for doc in docs:
            doc2es = self.prepare_doc(doc)
            if doc2es:
                yield doc2es

    def commit_docs(self, docs):
        self.es.bulk_operation(docs)

    def __call__(self, fp):
        tweets = self.generate_tweets(fp)
        tweets = self.prepare_docs(tweets)
        self.commit_docs(tweets)
Example #5
def gzip_to_es(move_per=10000):
    es = Elastic()

    def get_labels():
        with gzip.open('tweets_labelled.gz', 'rt', encoding='utf-8') as f:
            for line in f:
                ID, label = line.strip().split('\t')
                yield ID, label

    def move_to_db(labels):
        es_update = []
        for ID, label in labels:
            es_update.append({
                'doc': {
                    'event_related': True if label == 'yes' else False
                },
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })

        es.bulk_operation(es_update)

    for i, labels in enumerate(chunker(get_labels(), move_per)):
        print(i)
        move_to_db(labels)
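
The chunker helper used above is not defined in this example. A minimal sketch of such a helper, assuming it takes an iterable plus a chunk size and yields lists of at most that size, could be:

from itertools import islice


def chunker(iterable, n):
    # Hypothetical helper (assumed signature): yield successive lists of at
    # most n items taken from the iterable.
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, n))
        if not chunk:
            return
        yield chunk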
Example #6
def export():
    es = Elastic()

    query = {}
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    n = 1
    with gzip.open('tweets.gz', 'wt', encoding='utf-8') as f:
        for tweet in tweets:
            if not n % 1000:
                print(f"{n} - {datetime.now()}")
            tweet = tweet['_source']
            if 'locations' in tweet:
                n += 1
                ID = tweet['id']
                text = clean_text(tweet['text'], lower=False)
                f.write(f'{ID}\t{text}\n')
Example #7
def once_a_week():
    elastic = Elastic('elastic:9200', 'steam_tmp')
    log.info('Updating data once a week!')
    games = get_games_db()
    for game in games:
        log.info('Starting the extraction of game: %s - %s', game[0], game[1])
        try:
            gm = youtube_api.get_game(str(game[1]), 'temporal')
            log.info('YouTube API: succeeded!')
            log.info('Starting update in Elasticsearch')
            elastic.update(int(game[0]), gm, 'game_tmp')
            log.info('Finished update in Elasticsearch')
        except Exception as error:
            if isinstance(error, GameNotFound):
                log.warning(error)
            else:
                log.error(error)
        # wait an hour, then re-queue the game so the loop keeps cycling
        time.sleep(3600)
        games.append(game)
Example #8
    def analyze_tweets_subbasin(self, subbasin, languages=None):
        from db.elastic import Elastic
        es = Elastic()
        query = {
            'query': {
                'term': {
                    'locations.subbasin_ids_9': subbasin
                }
            },
            'sort': {
                'date': 'asc'
            }
        }

        data = []
        tweets = es.scroll_through(index='floods_all',
                                   body=query,
                                   source=False)
        for tweet in tweets:
            detailed_locations = [
                loc for loc in tweet['locations']
                if loc['type'] in ('town', 'adm5', 'adm4', 'adm3', 'landmark')
            ]
            if len(detailed_locations) != 1:
                continue

            detailed_location = detailed_locations[0]
            if subbasin not in detailed_location['subbasin_ids_9']:
                continue

            if detailed_location['score'] < .2:
                continue

            tweet_lang = tweet['source']['lang']
            if languages and tweet_lang not in languages:
                continue

            data.append((subbasin, tweet['id'], tweet['date'], tweet['text'],
                         tweet_lang, None))
        self.process(data, res_file=subbasin, include_context='hydrology')
Example #9
def try_fails_id():
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Trying to insert the failed ids again!')
    with open("ids_fails.txt", "r") as games:
        for game in games:
            game_id, game_name = game.strip().split(" || ")
            game_id = int(game_id)
            log.info('Starting the extraction of game: %s - %s', game_id,
                     game_name)
            try:
                game = steam_api.get_game(game_id, 'estastic')
                log.info('Steam API: succeeded!')
                game.update(steam_spy.get_game(game_id, 'estastic'))
                log.info('Steam SPY: succeeded!')
                log.info('Starting insertion into Elasticsearch')
                elastic.update(game_id, game, 'game_est')
                log.info('Finished insertion into Elasticsearch')
            except Exception as error:
                if isinstance(error, GameNotFound):
                    log.warning(error)
                else:
                    log.error(error)
                time.sleep(300)
    os.remove("ids_fails.txt")
Example #10
class Fill:
    def __init__(self):
        self.keywords = self.set_keywords()
        self.es = Elastic()

    def set_keywords(self):
        df = pd.read_excel('input/twitter_supported_languages.xlsx')
        df = df[df['implemented'] == True].set_index(
            'language_code')['floods_filtered']
        keywords = {}
        # Series.iteritems() was removed in pandas 2.0; .items() is equivalent
        for language, words in df.items():
            keywords[language] = set(
                [word.strip().lower() for word in words.split(',')])
        return keywords

    def open(self, fp):
        if fp.endswith('.gzip') or fp.endswith('.gz'):
            with gzip.open(fp, 'r') as gz:
                for tweet in gz:
                    yield tweet.decode('utf-8')
        elif fp.endswith('.jsonl'):
            # open in text mode so that startswith('#') compares str with str
            with open(fp, 'r', encoding='utf-8') as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    yield line.strip()
        else:
            raise NotImplementedError(
                f'reader for extension {fp.split(".")[-1]} not implemented')

    def generate_tweets(self, fp, start=datetime(1970, 1, 1)):
        for tweet in self.open(fp):
            try:
                tweet = json.loads(tweet)
            except json.decoder.JSONDecodeError:
                continue
            yield tweet

    def prepare_doc(self, json_doc):
        if 'limit' in json_doc:
            return None
        doc2es = tweet_parser(json_doc)
        if not doc2es:
            return None
        language = doc2es['source']['lang']
        clean_text = sanitize.clean_text(doc2es['text'], lower=False)
        clean_text_lower = clean_text.lower()
        try:
            if not any(keyword in clean_text_lower
                       for keyword in self.keywords[language]):
                return None
        except KeyError:
            return None
        doc2es['_index'] = DOCUMENT_INDEX
        doc2es['_id'] = doc2es['id']
        doc2es['source']['type'] = 'tweet'
        return doc2es

    def prepare_docs(self, docs):
        for doc in docs:
            doc2es = self.prepare_doc(doc)
            if doc2es:
                yield doc2es

    def commit_docs(self, docs):
        self.es.bulk_operation(docs)

    def __call__(self, fp):
        tweets = self.generate_tweets(fp)
        tweets = self.prepare_docs(tweets)
        self.commit_docs(tweets)
Example #11
class Detection(Geoparser):
    def __init__(
        self,
        doc_loader,
        n_words,
        classify_tweets,
        minimum_gram_length,
        max_distance_entities_doc,
        doc_score_types,
    ):
        """Get out doc_analyzer, save the minimum score neccesary for docs
        and if the event detection module is turned on, initalize the class
        for that (spinup)"""
        self.n_words = n_words
        self.classify_tweets = classify_tweets
        self.es = Elastic(host=ELASTIC_HOST)
        self.check_toponym_index()
        self.pg = PostgreSQL('gfm')
        super().__init__(self.pg, self.es, doc_score_types,
                         max_distance_entities_doc)
        if self.classify_tweets == 'bert':
            self.text_classifier = TextClassifier()
        self.docs = {}
        doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
        from doc_loader import DocLoaderES
        self.doc_loader = DocLoaderES(*doc_loader_args)

    def check_toponym_index(self):
        if not self.es.indices.exists("locations"):
            print("Toponym index does not exist")
            sys.exit()

    def maybe_set_table_name(self, regions, detection_parameters):
        for i, setting in enumerate(detection_parameters):
            if setting.run_name is None:
                run_name = ("floods"
                            f"_{regions}"
                            f"_{int(setting.location_threshold*100)}"
                            f"_{setting.factor}_{int(setting.fraction*10)}"
                            f"_{setting.base}")
                detection_parameters[i] = setting._replace(run_name=run_name)

    def initial_detection(
        self,
        start,
        end,
    ):
        print("Initial detection")
        for query_start, query_end in daterange(start,
                                                end,
                                                timedelta(days=1),
                                                ranges=True):
            query_end = min(query_end, end)
            print("Initial detection:", query_start, "-", query_end)
            query = self.es.build_date_query(
                query_start,
                query_end,
                locations=True,
            )
            query['query']['bool']['must'].append(
                {'term': {
                    'event_related': True
                }})
            documents = self.es.scroll_through(index=DOCUMENT_INDEX,
                                               body=query,
                                               source=False)
            self.event_detector.detect_events_l(documents,
                                                is_real_time=mp.Value(
                                                    c_bool, False),
                                                convert_to_named_tuple=True)
        print("Finished initial detection")

    def run(
        self,
        start,
        spinup_time,
        timestep_length,
        analysis_length,
        detection_parameters,
        regions,
        real_time,
        max_n_docs_in_memory=None,
        check_previous_docs=True,
        geoparsing_start=False,
        update_locations=True,
        end=False,
        load_detectors=False,
        detection=True,
    ):
        """This program uses 2 processes. The main process (this one) that
        analyzes groups of docs and detects based on this. In addition a
        child process is spawned that reads the docs from the database or
        receives them from a stream. This process is the doc_loader.
        Two events, event_1 and event_2, regulate the execution of both
        processes. First the doc_loader loads the docs used for the spinup
        from the database, then the docs for the first timestep, which are
        all put in a queue (docs_queue). Then this one of the events is
        released, while the doc_loader is paused. The execution of the main
        process is restarted. First it unloads the docs from the docs_queue
        and releases the doc_loader again. This process then iterates."""
        if not update_locations:
            print("WARNING: Not updating locations")

        # Check that the timestep is not longer than the analysis length
        if timestep_length > analysis_length:
            print("Timestep length must not exceed analysis length")
            sys.exit(0)

        # Set parameters for sharing between processes
        n_docs_to_unload = Counter(0)
        timestep_end_str = mp.Array('c', 26)
        docs_queue = mp.Queue()
        event_1 = mp.Event()
        event_2 = mp.Event()
        is_real_time = mp.Value(c_bool, False)

        end_date_spinup = start + spinup_time
        if geoparsing_start:
            if geoparsing_start < start:
                print("ERROR: Geoparsing start is smaller than start date")
                sys.exit()
            geoparsing_start = int((geoparsing_start - start) /
                                   timestep_length) * timestep_length + start
            print("Geoparsing start:", geoparsing_start)
            doc_loader_start = geoparsing_start
        else:
            doc_loader_start = start

        doc_loader_mp = Process(target=self.doc_loader.load_docs,
                                args=(docs_queue, n_docs_to_unload,
                                      doc_loader_start, analysis_length,
                                      timestep_length, event_1, event_2,
                                      timestep_end_str, is_real_time))
        doc_loader_mp.daemon = True
        doc_loader_mp.start()

        if detection and geoparsing_start and geoparsing_start > end_date_spinup:
            self.event_detector = EventDetector(
                self.pg,
                self.es,
                start,
                spinup_time,
                detection_parameters=detection_parameters,
                regions=regions,
                load_detectors=load_detectors,
            )
            self.initial_detection(start, geoparsing_start)
            end_date_spinup = None

        while real_time or not is_real_time.value:
            event_1.wait()
            if doc_loader_mp.exception is not None:
                _, traceback = doc_loader_mp.exception
                print(traceback)
                sys.exit()

            unloaded_docs = []
            for i in range(n_docs_to_unload.value()):
                unloaded_docs.append(docs_queue.get())
                n_docs_to_unload.decrease()

            if self.classify_tweets == 'bert':
                about_ongoing_event_docs = []
                about_ongoing_event_doc_ids = set()
                classified_docs = set()

                # Check whether documents are already classified in ES. If so, load classification from ES.
                if unloaded_docs:
                    documents = self.es.mget(
                        index=DOCUMENT_INDEX,
                        body={'ids': [ID for ID, _ in unloaded_docs]})['docs']
                    for doc in documents:
                        doc = doc['_source']
                        if 'event_related' in doc:
                            classified_docs.add(doc['id'])
                            if doc['event_related'] is True:
                                about_ongoing_event_doc_ids.add(doc['id'])

                for doc in unloaded_docs:
                    if doc[0] in about_ongoing_event_doc_ids:
                        about_ongoing_event_docs.append(doc)

                docs_to_classify = []
                examples_to_classify = []
                for doc in unloaded_docs:
                    ID, doc_info = doc
                    if ID not in classified_docs:
                        example = {
                            'id': ID,
                            'sentence1': doc_info.clean_text,
                            'label': 0
                        }
                        examples_to_classify.append(example)
                        docs_to_classify.append(doc)

                classes = self.text_classifier(examples_to_classify)

                assert len(classes) == len(docs_to_classify)
                es_update = []
                for doc_class, doc in zip(classes, docs_to_classify):
                    doc_class = True if doc_class == 'yes' else False
                    if doc_class is True:
                        about_ongoing_event_docs.append(doc)
                    es_update.append({
                        'doc': {
                            'event_related': doc_class
                        },
                        '_index': DOCUMENT_INDEX,
                        '_id': doc[0],
                        '_op_type': 'update',
                    })

                self.es.bulk_operation(es_update)

                about_ongoing_event_docs = sorted(about_ongoing_event_docs,
                                                  key=lambda x: x[1].date,
                                                  reverse=False)

                self.docs.update(dict(about_ongoing_event_docs))
            elif self.classify_tweets == 'db':
                # Check whether documents are already classified in ES. If so, load classification from ES.
                about_ongoing_event_docs = []
                about_ongoing_event_doc_ids = set()
                if unloaded_docs:
                    documents = self.es.mget(
                        index=DOCUMENT_INDEX,
                        body={'ids': [ID for ID, _ in unloaded_docs]})['docs']
                    for doc in documents:
                        doc = doc['_source']
                        if doc['event_related'] is True:
                            about_ongoing_event_doc_ids.add(doc['id'])

                    for doc in unloaded_docs:
                        if doc[0] in about_ongoing_event_doc_ids:
                            about_ongoing_event_docs.append(doc)
                self.docs.update(dict(about_ongoing_event_docs))
            else:
                self.docs.update(dict(unloaded_docs))

            if max_n_docs_in_memory is not None and len(
                    self.docs) > max_n_docs_in_memory:
                n_docs_to_delete = len(self.docs) - max_n_docs_in_memory
                IDs_to_remove = list(self.docs.keys())[:n_docs_to_delete]
                for ID in IDs_to_remove:
                    del self.docs[ID]

            event_1.clear()
            event_2.set()
            near_end_date_spinup = False
            if self.docs:
                timestep_end = str(timestep_end_str.value, 'utf-8')
                timestep_end = isoformat_2_date(timestep_end)
                l_docs = []

                if detection and end_date_spinup and timestep_end >= end_date_spinup:
                    self.event_detector = EventDetector(
                        self.pg,
                        self.es,
                        start,
                        spinup_time,
                        detection_parameters=detection_parameters,
                        load_detectors=load_detectors,
                        regions=regions)
                    self.initial_detection(start,
                                           timestep_end - analysis_length)
                    near_end_date_spinup = True

                for ID, doc in self.docs.items():
                    if doc.date > timestep_end - analysis_length:
                        break
                    else:
                        l_docs.append(ID)

                for i, ID in enumerate(l_docs):
                    l_docs[i] = self.docs[ID]
                    del self.docs[ID]

                self.geoparse_timestep(timestep_end,
                                       update_locations=update_locations)
                if detection and not end_date_spinup and (
                        not geoparsing_start
                        or timestep_end > geoparsing_start + analysis_length):
                    self.event_detector.detect_events_l(
                        l_docs, is_real_time=is_real_time)
                    self.event_detector.detect_events_s(
                        self.docs.values(), is_real_time=is_real_time)
                if near_end_date_spinup:
                    end_date_spinup = None

                if end and timestep_end > end:
                    return None
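
The run() docstring above describes a handshake between the main process and the doc_loader child using a queue, a shared counter and two events. The sketch below is a stripped-down, self-contained illustration of that pattern with hypothetical names (loader, main, batches); it is not the project's actual doc_loader API.

import multiprocessing as mp


def loader(queue, n_to_unload, event_1, event_2, batches):
    # Child process: enqueue one batch per cycle, then wait for the main
    # process to drain the queue before loading the next batch.
    for batch in batches:
        for doc in batch:
            queue.put(doc)
            with n_to_unload.get_lock():
                n_to_unload.value += 1
        event_2.clear()
        event_1.set()   # signal: a batch is ready
        event_2.wait()  # pause until the main process releases us


def main():
    queue = mp.Queue()
    n_to_unload = mp.Value('i', 0)
    event_1, event_2 = mp.Event(), mp.Event()
    batches = [[1, 2, 3], [4, 5], [6]]

    child = mp.Process(target=loader,
                       args=(queue, n_to_unload, event_1, event_2, batches))
    child.daemon = True
    child.start()

    for _ in batches:
        event_1.wait()            # wait until the loader has queued a batch
        docs = []
        while n_to_unload.value:
            docs.append(queue.get())
            with n_to_unload.get_lock():
                n_to_unload.value -= 1
        print('processing', docs)
        event_1.clear()
        event_2.set()             # release the loader for the next batch
    child.join(timeout=1)


if __name__ == '__main__':
    main()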
Example #12
    def load_docs(self,
                  docs_queue,
                  n_docs_to_unload,
                  start,
                  analysis_length,
                  timestep_length,
                  event_1,
                  event_2,
                  timestep_end_str,
                  is_real_time,
                  datetime=datetime):
        try:
            es = Elastic(host=ELASTIC_HOST)
            pg = PostgreSQL('gfm')
            doc_analyzer = DocAnalyzer(es, pg, self.doc_score_types,
                                       self.n_words, self.minimum_gram_length)
            spinup_start = start - analysis_length + timestep_length
            self.load_timestep_es(es, doc_analyzer, docs_queue,
                                  n_docs_to_unload, spinup_start, start)

            timestep = 1
            timestep_end = start + timestep * timestep_length

            while timestep_end < datetime.utcnow():
                query_start = timestep_end - timestep_length

                self.load_timestep_es(es, doc_analyzer, docs_queue,
                                      n_docs_to_unload, query_start,
                                      timestep_end)

                timestep_end_str.value = self.encode_dt(timestep_end)
                timestep += 1
                timestep_end = start + timestep * timestep_length

                event_2.clear()
                event_1.set()
                event_2.wait()

            last_timestep_end = timestep_end - timestep_length
            is_real_time.value = True

            while True:
                timestep_end = datetime.utcnow()

                sleep = (timedelta(minutes=3) -
                         (timestep_end - last_timestep_end)).total_seconds()
                if sleep > 0:
                    time.sleep(sleep)
                    timestep_end = datetime.utcnow()

                self.load_timestep_es(es, doc_analyzer, docs_queue,
                                      n_docs_to_unload, last_timestep_end,
                                      timestep_end)
                last_timestep_end = timestep_end
                timestep_end_str.value = self.encode_dt(timestep_end)

                event_2.clear()
                event_1.set()
                event_2.wait()
        except Exception:
            # nothing to handle here; re-raise so the failure is not
            # silently swallowed
            raise
Example #13
                },
                "publishers": {
                    "type": "keyword",
                    "store": "true"
                },
                "platforms": {
                    "type": "keyword",
                    "store": "true"
                },
            }
        },
    }
}

try:
    elastic = Elastic('elastic:9200', 'steam_est')
    log.info('Elasticsearch connected')
    log.info('Creating index Steam Estastic on Elasticsearch')
    elastic.create_index(index_body)
    log.info('Index Steam Created')
    games = get_all_games()
    log.debug(len(games))
    for game in games:
        game_id, game_name = int(game[0]), str(game[1])
        log.info('Starting the extraction of game: %s - %s', game_id,
                 game_name)
        try:
            game = steam_api.get_game(game_id, 'estastic')
            log.info('Steam API: succeeded!')
            game.update(steam_spy.get_game(game_id, 'estastic'))
            log.info('Steam SPY: succeeded!')
Example #14
from db.elastic import Elastic
from config import DOC_SCORE_TYPES, DOCUMENT_INDEX

es = Elastic()
es.maybe_create_document_index(DOCUMENT_INDEX, DOC_SCORE_TYPES)
Example #15
def classify():
    es = Elastic()

    classify_per = 10_000

    if refresh:
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')

    predictor = Predictor()

    query = {
        'query': {
            "bool": {
                "must": [
                    {
                        'exists': {
                            'field': 'locations'
                        }
                    }
                ],
                "must_not": {
                    'exists': {
                        'field': 'event_related'
                    }
                }
            }
        }
    }
    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i/n*100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)

        if len(tweet_subset) == classify_per:
            IDs = []
            examples = []
            for tweet in tweet_subset:
                tweet = tweet['_source']
                IDs.append(tweet['id'])
                example = {
                    "id": tweet['id'],
                    "sentence1": clean_text(tweet['text'], lower=False),
                    "label": 0
                }
                examples.append(example)

            labels = predictor(examples)
            es_update = []
            for ID, label in zip(IDs, labels):
                es_update.append({
                    'doc': {
                        'event_related': True if label == 'yes' else False
                    },
                    '_index': DOCUMENT_INDEX,
                    '_id': ID,
                    '_op_type': 'update',
                })

            es.bulk_operation(es_update)

            tweet_subset = []
Example #16
 def __init__(self):
     self.keywords = self.set_keywords()
     self.es = Elastic()
Example #17
from db.elastic import Elastic
import sys

es = Elastic()


def remove_field_from_index(index, field):
    body = {
        "query": {
            "bool": {
                "must": [
                    {
                        "exists": {"field": field}
                    }
                ]
            }
        }
    }
    print(f"removing {es.n_hits(index=index, body=body)} documents from index '{index}'")
    body.update({
        "script": {
            "inline": f"ctx._source.remove(\"{field}\")"
        }
    })
    es.update_by_query(index=index, body=body, conflicts='proceed')


if __name__ == '__main__':
    remove_field_from_index(sys.argv[-2], sys.argv[-1])
Example #18
# Name of the PostgreSQL database (lowercase)
POSTGRESQL_DB = 'taggs'
# Name of the toponym resolution table
TOPONYM_RESOLUTION_TABLE = 'toponym_resolution_table'
# Refresh time of the realtime geotagging module
REAL_TIME_TAGGER_REFRESH_TIME = 300  # sec
# Name of the Elasticsearch index with tweets
TWEETS_INDEX = 'taggs'
# Name of the Elasticsearch index with toponyms
TOPONYM_INDEX = 'toponyms'

# Update tweets in the database with their locations (flag for testing purposes)
UPDATE = False

# Connect to databases
es_tweets = Elastic()
es_toponyms = es_tweets
pg_Geotag = PostgreSQL(POSTGRESQL_DB)
pg = PostgreSQL(POSTGRESQL_DB)


# The functions below are meant to connect to your database.
class TweetAnalyzerCustom:
    # ID = ID of the tweet as str
    # tweet = {
    #     'date': '%a %b %d %H:%M:%S +0000 %Y',
    #     'user': {
    #                     'id': user ID,
    #                     'location': user location,
    #                     'time zone': user time zone,
    #     },