Example #1
def __init__(
    self,
    doc_loader,
    n_words,
    classify_tweets,
    minimum_gram_length,
    max_distance_entities_doc,
    doc_score_types,
):
    """Set up the doc_analyzer, save the minimum score necessary for
    docs and, if the event detection module is turned on, initialize
    the class for that (spinup)."""
    self.n_words = n_words
    self.classify_tweets = classify_tweets
    self.es = Elastic(host=ELASTIC_HOST)
    self.check_toponym_index()
    self.pg = PostgreSQL('gfm')
    super().__init__(self.pg, self.es, doc_score_types,
                     max_distance_entities_doc)
    if self.classify_tweets == 'bert':
        self.text_classifier = TextClassifier()
    self.docs = {}
    doc_loader_args = (doc_score_types, n_words, minimum_gram_length)
    # Imported locally, presumably to avoid a circular import.
    from doc_loader import DocLoaderES
    self.doc_loader = DocLoaderES(*doc_loader_args)
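Example #2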
    array = np.arange(ysize * xsize).reshape((ysize, xsize))

    ds.SetGeoTransform(gt)
    ds.GetRasterBand(1).WriteArray(array)
    source = osr.SpatialReference()
    source.ImportFromEPSG(EPSG)
    ds.SetProjection(source.ExportToWkt())
    ds = None
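
The fragment above starts mid-function: ds, gt, xsize, ysize, and EPSG are defined earlier in the original script. A minimal self-contained sketch of the full GeoTIFF-creation pattern it belongs to, with hypothetical values for those names:

import numpy as np
from osgeo import gdal, osr

xsize, ysize = 100, 100                # hypothetical raster dimensions
EPSG = 4326                            # hypothetical coordinate reference system
gt = (0.0, 0.1, 0.0, 0.0, 0.0, -0.1)   # hypothetical geotransform
tif_file = 'example.tif'

# Create a single-band GeoTIFF, write the array, and stamp it with a
# geotransform and projection.
ds = gdal.GetDriverByName('GTiff').Create(tif_file, xsize, ysize, 1,
                                          gdal.GDT_Float32)
array = np.arange(ysize * xsize).reshape((ysize, xsize))
ds.SetGeoTransform(gt)
ds.GetRasterBand(1).WriteArray(array)
source = osr.SpatialReference()
source.ImportFromEPSG(EPSG)
ds.SetProjection(source.ExportToWkt())
ds = None  # dereference to flush the file to disk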

if not os.path.exists(shp_file):
    subprocess.call(
        r"python C:\Users\jadeb\Anaconda3\Scripts\gdal_polygonize.py" +
        f" {tif_file} {shp_file}",
        shell=True)
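
The gdal_polygonize.py path above is hardcoded to one machine. A more portable sketch, assuming gdal_polygonize.py is on the PATH of the active (e.g. conda) environment:

import shutil
import subprocess
import sys

# tif_file and shp_file are the same variables used above.
polygonize = shutil.which('gdal_polygonize.py')
if polygonize is None:
    raise FileNotFoundError('gdal_polygonize.py not found on PATH')
subprocess.call([sys.executable, polygonize, tif_file, shp_file])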

pg = PostgreSQL('classification')

if not pg.table_exists(f'{RAINFALL_TYPE.lower()}_raster'):
    gdf = gpd.GeoDataFrame.from_file(shp_file)
    print('finished reading file')

    def x():
        # Yield the column index of every pixel, in row-major order.
        for _ in range(ysize):
            for j in range(xsize):
                yield j

    def y():
        # Yield the row index of every pixel, in row-major order.
        for i in range(ysize):
            for _ in range(xsize):
                yield i
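
A toy sketch of what these generators produce, together with an equivalent vectorized form (the sizes here are illustrative):

import numpy as np

ysize, xsize = 2, 3  # toy raster dimensions

def x():
    for _ in range(ysize):
        for j in range(xsize):
            yield j

def y():
    for i in range(ysize):
        for _ in range(xsize):
            yield i

print(list(x()))  # [0, 1, 2, 0, 1, 2] -> column index per pixel
print(list(y()))  # [0, 0, 0, 1, 1, 1] -> row index per pixel

# np.indices builds the same per-pixel row/column grids in one call.
rows, cols = np.indices((ysize, xsize))
assert cols.ravel().tolist() == list(x())
assert rows.ravel().tolist() == list(y())

Example #3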
import csv
import os

import geopandas as gpd
import pandas as pd
import psycopg2
from psycopg2.extensions import AsIs

from db.postgresql import PostgreSQL
from db.elastic import Elastic
from config import LEVEL_2_COUNTRIES, PG_DB, DOCUMENT_INDEX, POSTGRESQL_USER

pd.options.mode.chained_assignment = None

# GeoNames feature codes for populated places
TOWN_CODES = {
    'PPL', 'PPLA', 'PPLA2', 'PPLA3', 'PPLA4', 'PPLC', 'PPLG', 'PPLR', 'PPLS',
    'PPLX', 'STLMT'
}

pg = PostgreSQL('gfm')
es = Elastic()


class Preprocess():
    def __init__(self):
        self.level_0_codes = self._load_level_0_codes()

    def _load_level_0_codes(self):
        gdf = gpd.GeoDataFrame.from_file(
            os.path.join('input', 'maps', 'level0.json'))
        return {'g-' + geonameid for geonameid in gdf['geoNameId']}

    def get_location_type(self,
                          country_code,
                          feature_code,
Example #4
    def load_docs(self,
                  docs_queue,
                  n_docs_to_unload,
                  start,
                  analysis_length,
                  timestep_length,
                  event_1,
                  event_2,
                  timestep_end_str,
                  is_real_time,
                  datetime=datetime):
        try:
            es = Elastic(host=ELASTIC_HOST)
            pg = PostgreSQL('gfm')
            doc_analyzer = DocAnalyzer(es, pg, self.doc_score_types,
                                       self.n_words, self.minimum_gram_length)
            spinup_start = start - analysis_length + timestep_length
            self.load_timestep_es(es, doc_analyzer, docs_queue,
                                  n_docs_to_unload, spinup_start, start)

            timestep = 1
            timestep_end = start + timestep * timestep_length

            while timestep_end < datetime.utcnow():
                query_start = timestep_end - timestep_length

                self.load_timestep_es(es, doc_analyzer, docs_queue,
                                      n_docs_to_unload, query_start,
                                      timestep_end)

                timestep_end_str.value = self.encode_dt(timestep_end)
                timestep += 1
                timestep_end = start + timestep * timestep_length

                event_2.clear()
                event_1.set()
                event_2.wait()

            last_timestep_end = timestep_end - timestep_length
            is_real_time.value = True

            while True:
                timestep_end = datetime.utcnow()

                sleep = (timedelta(minutes=3) -
                         (timestep_end - last_timestep_end)).total_seconds()
                if sleep > 0:
                    time.sleep(sleep)
                    timestep_end = datetime.utcnow()

                self.load_timestep_es(es, doc_analyzer, docs_queue,
                                      n_docs_to_unload, last_timestep_end,
                                      timestep_end)
                last_timestep_end = timestep_end
                timestep_end_str.value = self.encode_dt(timestep_end)

                event_2.clear()
                event_1.set()
                event_2.wait()
        except Exception:
            # Nothing is handled here; re-raise so the error surfaces
            # in the parent process with its full traceback.
            raise
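
event_1 and event_2 implement a two-event handshake: the loader signals that a timestep is ready (event_1.set()) and then blocks (event_2.wait()) until the consumer releases it. A minimal sketch of both sides of this protocol with multiprocessing primitives (the consumer logic is an assumption based on the calls above):

from multiprocessing import Event, Process

def consumer(event_1, event_2, n_timesteps=3):
    for _ in range(n_timesteps):
        event_1.wait()   # block until the loader finishes a timestep
        event_1.clear()
        # ... drain the docs queue and run the analysis here ...
        event_2.set()    # release the loader for the next timestep

if __name__ == '__main__':
    event_1, event_2 = Event(), Event()
    p = Process(target=consumer, args=(event_1, event_2))
    p.start()
    for _ in range(3):   # loader side, as in load_docs above
        event_2.clear()
        event_1.set()
        event_2.wait()
    p.join()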
Example #5
# Name of the toponym resolution table
TOPONYM_RESOLUTION_TABLE = 'toponym_resolution_table'
# Refresh time of the realtime geotagging module
REAL_TIME_TAGGER_REFRESH_TIME = 300  # sec
# Name of the Elasticsearch index with tweets
TWEETS_INDEX = 'taggs'
# Name of the Elasticsearch index with toponyms
TOPONYM_INDEX = 'toponyms'

# Update tweets in the database with their locations (flag for testing purposes)
UPDATE = False

# Connect to databases
es_tweets = Elastic()
es_toponyms = es_tweets
pg_Geotag = PostgreSQL(POSTGRESQL_DB)
pg = PostgreSQL(POSTGRESQL_DB)


# The class below is meant to connect to your database.
class TweetAnalyzerCustom:
    # ID = ID of the tweet as str
    # tweet = {
    #     'date': '%a %b %d %H:%M:%S +0000 %Y',
    #     'user': {
    #                     'id': user ID,
    #                     'location': user location,
    #                     'time zone': user time zone,
    #     },
    #     'text': text in UTF-8 (retweeted_status text if a retweet, otherwise the tweet text),
    #     'retweet': Boolean: True or False,
Example #6
def __init__(self):
    # Connect to PostgreSQL and enable the PostGIS extension
    PostgreSQL.__init__(self, POSTGRESQL_DB)
    PostgreSQL.initialize_postgis(self)
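
PostgreSQL here is the project's own wrapper class, whose implementation is not shown. A hypothetical minimal sketch of what such a wrapper's initialize_postgis could look like (the connection details are assumptions for illustration):

import psycopg2

class PostgreSQL:
    def __init__(self, db):
        # Connection parameters would normally come from config.
        self.conn = psycopg2.connect(dbname=db)
        self.conn.autocommit = True

    def initialize_postgis(self):
        # Enable PostGIS so geometry types and spatial functions exist.
        with self.conn.cursor() as cur:
            cur.execute('CREATE EXTENSION IF NOT EXISTS postgis')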