Example no. 1
from copy import deepcopy

from shared.LoggerFactory import LoggerFactory


class JsonPost:
    """Handles the data to find duplicates."""

    logger = LoggerFactory.get_enhancement_logger()

    def __init__(self, json):
        """Constructor of JsonPost."""

        self.json = json
        self.mod_json = self.__remove_not_needed_data(deepcopy(json))
        self.is_duplicate = False

    @staticmethod
    def __remove_not_needed_data(json):
        """Removes data from json which should not be use in the comparison."""
        JsonPost.logger.debug("__remove_not_needed_data()")

        if 'link' in json:
            del json['link']
        if 'source' in json:
            del json['source']
        if 'id' in json:
            del json['id']
        if 'post_struct' in json:
            del json['post_struct']

        return json
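
JsonPost keeps a stripped copy of the payload in mod_json so that volatile fields such as 'id' or 'link' do not affect duplicate detection. A minimal sketch of how two wrapped posts might then be compared; the helper below is illustrative and not part of the example above:

def is_same_post(post_a, post_b):
    """Illustrative only: two posts count as duplicates if their stripped payloads match."""
    return post_a.mod_json == post_b.mod_json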
Example no. 2
    def __init__(self, name):
        """Constructor of the scraper."""

        # Scraper name -> Overwritten by name of the scraper file
        self.name = name

        # The URLs which will be parsed and scraped
        self.urls = []

        # An error object to keep track of error occurrences (which is used for logging)
        self.errors = []

        # logger for Scraper
        self.logger = LoggerFactory.get_logger(name)

        # start time of logger
        self.start = None
Example no. 3
    def __init__(self, data, domain_name):
        """Constructor of Enhancer."""
        self.__data = data
        self.__domain_name = domain_name
        self.logger = LoggerFactory.get_enhancement_logger()

        # Maps each domain to the function containing its domain-specific enhancement:
        # 'domain': self.__enhance_domain_function_name
        # Make sure to enter 'self.__enhance_domain_function_name' and not
        # 'self.__enhance_domain_function_name()', as the parentheses would turn the
        # reference into a function call.
        self.__function_map = {
            'ehrenamt_hessen': self.__enhance_ehrenamt_hessen,
            'weltwaerts': self.__enhance_weltwaerts,
            'gutetat_berlin': self.__enhance_gute_tat,
            'gutetat_hamburg': self.__enhance_gute_tat,
            'gutetat_munich': self.__enhance_gute_tat,
            'ein_jahr_freiwillig': self.__enhance_ein_jahr_freiwillig,
            'bundesfreiwilligendienst':
            self.__enhance_bundesfreiwilligendienst,
        }
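
For context, a minimal sketch of how such a map is typically dispatched. The Enhancer does expose a run() method (it is called in Example no. 5 below), but the body shown here is an assumption for illustration, not taken from the project:

    def run(self):
        """Illustrative sketch: look up the enhancement function for this domain and call it."""
        enhance = self.__function_map[self.__domain_name]  # a reference, no parentheses yet
        enhance(self.__data)                               # the call happens here
        return self.__data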
Example no. 4
from shared.LoggerFactory import LoggerFactory

logger = LoggerFactory.get_enhancement_logger()


def add_map_address(data):
    """ dummy function for adding map_adress """
    logger.debug("add_map_address()")

    for post in data:
        post['post_struct']['map_address'] = ''
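
These examples assume each post is a dict carrying a nested post_struct dict. A hedged sketch of the minimal shape the functions shown here touch (field values are illustrative):

post = {
    'geo_location': None,                 # filled later by the LatLonEnhancer shown further below
    'post_struct': {
        'map_address': '',
        'geo_location': None,
        'location': {'street': '', 'zipcode': '', 'city': '', 'country': ''},
    },
}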
Example no. 5
import os
import sys

# Root Directory (/etl)
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
os.environ['ROOT_DIR'] = ROOT_DIR

# Adds path of the data extraction modules
sys.path.extend([f'{ROOT_DIR}/data_extraction', f'{ROOT_DIR}/shared'])

from data_enhancement import enhance_data
from data_extraction.scrape_data import run as run_extraction
from shared.utils import write_data_to_json, read_data_from_json
from shared.LoggerFactory import LoggerFactory
from data_management.DataManager import DataManager


logger = LoggerFactory.get_general_logger()
# Runs the extraction process and writes the scraped data to data_extraction/data directory
run_extraction()

for file in os.scandir(os.path.join(ROOT_DIR, 'data_extraction/data')):
    file_name = os.path.splitext(file.name)[0]
    # read scraped data for enhancement
    data = read_data_from_json(file.path)

    # Enhance data
    enhanced_data = enhance_data.Enhancer(data, file_name).run()

    # Write enhanced data to files
    write_data_to_json(os.path.join(ROOT_DIR, 'data_enhancement/data', f'{file_name}.json'), enhanced_data)

DataManager.run_backup_process()
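
The helpers read_data_from_json and write_data_to_json come from shared.utils and are not shown in these examples. A minimal sketch of what such helpers might look like, assuming plain JSON files of posts (a guess, not the project's actual implementation):

import json


def read_data_from_json(path):
    """Illustrative sketch: load a list of posts from a JSON file."""
    with open(path, 'r', encoding='utf-8') as file:
        return json.load(file)


def write_data_to_json(path, data):
    """Illustrative sketch: write a list of posts to a JSON file."""
    with open(path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=2)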
Example no. 6
import datetime
import os
import shutil
import time

from shared.utils import read_data_from_json, write_data_to_json
from shared.LoggerFactory import LoggerFactory

# ROOT_DIR is set by the entry scripts (see Examples no. 5 and 8)
ROOT_DIR = os.environ['ROOT_DIR']


class DataManager:
    """ Class that collects the enhanced data from the scraping process, manages backups and composes the data that is
     will be uploaded to elasticsearch
     - After the enhancement process, the results are stored in a backup folder named after the time of the backup
     - the upload folder contains the data that will be uploaded to elasticsearch

     For the upload data, the most recent backup data is selected. The amount of posts in the selected dataset is
     compared against older backups according to the value set in fallback_depth. A fallback_depth of 2 means that the
     data from the last 2 backups that are older than the current selected backup is compared with the upload data.
     If the current dataset for the upload contains less than X% of the posts in the backup dataset, where X is the
     defined threshold, the backup dataset is selected instead. This is done on a file by file basis, meaning the
     upload can contain a mixture of files from different backups.

     The source of all files in the upload folder at the end of the process gets logged."""

    # manages how many backups are to be kept. If the number of existing backups would exceed this threshold, the
    # oldest backup gets deleted
    max_number_of_backups = 7

    # manages how many backups into the past should be considered for the upload
    fallback_depth = 3

    # defines the percentage threshold at which data from an older backup is used instead: if the data currently
    # selected for upload contains fewer than [threshold] * [number of posts in backup] posts, the data from that
    # backup is selected for upload
    threshold = 0.75
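    # Worked example of the rule above (illustrative numbers, not taken from real data):
    #   the upload copy of a file holds 60 posts, the same file in an older backup holds 100;
    #   60 < 0.75 * 100, so the backup copy would replace the upload copy for that file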

    enhanced_data_location = os.path.join(ROOT_DIR, 'data_enhancement/data')
    backup_directory = os.path.join(ROOT_DIR, 'data_management', 'backups')
    upload_directory = os.path.join(ROOT_DIR, 'data_management', 'upload')
    file_upload_data_origin = os.path.join(ROOT_DIR, 'logs',
                                           'upload_data_origin.log')

    mask_timestamp = '%d.%m.%Y'

    logger = LoggerFactory.get_datamanagement_logger()

    data_origin = dict()

    @staticmethod
    def timestamp_to_datestring(timestamp):
        """Converts unix timestamp into datestring"""
        DataManager.logger.debug("timestamp_to_datestring()")

        return datetime.datetime.fromtimestamp(timestamp).strftime(
            DataManager.mask_timestamp)

    @staticmethod
    def datestring_to_timestamp(datestring):
        """Converts datestring into unix timestamp"""
        DataManager.logger.debug("datestring_to_timestamp()")

        return time.mktime(
            datetime.datetime.strptime(datestring,
                                       DataManager.mask_timestamp).timetuple())

    @staticmethod
    def save_upload_data_origin(upload_data_origin):
        """Saves the information about the origin of the data inside the upload folder into a text file"""
        with open(DataManager.file_upload_data_origin, 'w',
                  encoding='utf-8') as file:
            file.write(
                f"last upload: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n"
            )
            file.write("Source for upload data:")
            file.write(upload_data_origin)

    @staticmethod
    def copy_from_backup(backup):
        """Copies all the files from a backup into the upload folder and documents the files origin"""
        DataManager.logger.debug("copy_from_backup()")

        path_backup = os.path.join(DataManager.backup_directory, backup)
        for file in os.listdir(path_backup):
            shutil.copy(os.path.join(path_backup, file),
                        os.path.join(DataManager.upload_directory, file))
            DataManager.data_origin[file] = backup

    @staticmethod
    def backup_current_data():
        """Creates a backup for the data in einander-helfen/etl/data_enhancement/data with current date as timestamp"""
        DataManager.logger.debug("backup_current_data()")

        backup_location = os.path.join(
            DataManager.backup_directory,
            DataManager.timestamp_to_datestring(time.time()))

        if os.path.exists(backup_location):
            DataManager.logger.warning(
                "There already exists a backup from today, deleting old backup"
            )
            shutil.rmtree(backup_location)
        os.makedirs(backup_location)

        enhancement_files = os.listdir(DataManager.enhanced_data_location)
        for file in enhancement_files:
            enhancement_file = os.path.join(DataManager.enhanced_data_location,
                                            file)
            if os.path.isfile(enhancement_file):
                shutil.copy(enhancement_file, backup_location)

    @staticmethod
    def get_sorted_list_of_backups():
        """ returns a list containing all the backup folders in a sorted order from old to new"""
        DataManager.logger.debug("get_sorted_list_of_backups()")

        backups = os.listdir(DataManager.backup_directory)
        backup_timestamps = list()
        for folder in backups:
            backup_timestamps.append(
                DataManager.datestring_to_timestamp(folder))
        backup_timestamps.sort()

        sorted_filenames = list()
        for timestamp in backup_timestamps:
            sorted_filenames.append(
                DataManager.timestamp_to_datestring(timestamp))
        return sorted_filenames

    @staticmethod
    def remove_old_backups():
        """Checks if the backup folder contains more than the maximum of set backups and deletes surplus"""
        DataManager.logger.debug("remove_old_backups()")

        backups = DataManager.get_sorted_list_of_backups()
        if len(backups) > DataManager.max_number_of_backups:
            DataManager.logger.info(
                f"More than {DataManager.max_number_of_backups} backups exist({len(backups)})"
                f", deleting {len(backups)- DataManager.max_number_of_backups} backup(s)"
            )
            for file in backups[:len(backups) -
                                DataManager.max_number_of_backups]:
                DataManager.logger.info(f"Deleting backup {file}")
                shutil.rmtree(os.path.join(DataManager.backup_directory, file))

    @staticmethod
    def clear_upload():
        """Clears the upload folder as preparation for the fresh upload data"""
        DataManager.logger.debug("clear_upload()")

        shutil.rmtree(DataManager.upload_directory)
        os.makedirs(DataManager.upload_directory)

    @staticmethod
    def get_eligible_backups():
        """Returns list of backups that are eligible as a fallback"""
        DataManager.logger.debug("get_eligible_backups()")

        backups = DataManager.get_sorted_list_of_backups()
        if len(backups) < DataManager.fallback_depth:
            DataManager.fallback_depth = len(backups)
        return backups[-DataManager.fallback_depth - 1:]

    @staticmethod
    def initialise_upload_data(backups):
        """Copies files from all backups within fallback depth into upload folder, the most recent backup is the last to
        get copied. As a result, upload now contains all files from the most recent scrape and any additional files
        from older backups within fallback range."""
        DataManager.logger.debug("initialise_upload_data()")

        for backup_folder in backups[-DataManager.fallback_depth - 1:]:
            DataManager.copy_from_backup(backup_folder)

    @staticmethod
    def build_string_data_origin():
        """Builds string with summary of which backup files in the upload folder are taken from"""
        DataManager.logger.debug("build_string_data_origin()")

        max_length = max((len(entry) for entry in DataManager.data_origin),
                         default=0)
        string_data_origin = ""
        for entry, origin in DataManager.data_origin.items():
            string_data_origin += f"\n{entry.rjust(max_length)} : {origin}"
        return string_data_origin

    @staticmethod
    def compose_upload():
        """Composes the upload according to the general behaviour described for this class and the set parameters"""
        DataManager.logger.debug("compose_upload()")

        DataManager.clear_upload()
        eligible_backups = DataManager.get_eligible_backups()
        DataManager.initialise_upload_data(eligible_backups)
        eligible_backups = eligible_backups[:-1]  # ignore most recent backup

        for backup in eligible_backups:
            for upload_file in os.listdir(DataManager.upload_directory):
                if os.path.isfile(
                        os.path.join(DataManager.backup_directory, backup,
                                     upload_file)):
                    data_in_upload = read_data_from_json(
                        os.path.join(DataManager.upload_directory,
                                     upload_file))
                    data_in_backup = read_data_from_json(
                        os.path.join(DataManager.backup_directory, backup,
                                     upload_file))

                    if len(data_in_upload
                           ) < DataManager.threshold * len(data_in_backup):
                        DataManager.logger.info(
                            f"{upload_file} contains less than 75% of the posts in backup "
                            f"'{backup}' ({len(data_in_upload)} posts vs {len(data_in_backup)} "
                            f"posts). Current data for {upload_file} will be replaced with backup "
                            f"data")
                        write_data_to_json(
                            os.path.join(DataManager.upload_directory,
                                         upload_file), data_in_backup)
                        DataManager.data_origin[upload_file] = backup
        upload_data_origin = DataManager.build_string_data_origin()
        DataManager.save_upload_data_origin(upload_data_origin)
        DataManager.logger.info(
            f"Source for upload data: {upload_data_origin}")

    @staticmethod
    def init():
        """Sets up the required folders and corrects set parameters if needed"""
        DataManager.logger.debug("init()")

        if not os.path.exists(DataManager.backup_directory):
            DataManager.logger.info("Creating backup directory")
            os.makedirs(DataManager.backup_directory)
        if not os.path.exists(DataManager.upload_directory):
            DataManager.logger.info("Creating upload directory")
            os.makedirs(DataManager.upload_directory)

        if DataManager.fallback_depth > DataManager.max_number_of_backups:
            DataManager.logger.warning(
                f"fallback depth exceeds maximal number of backups ("
                f"{DataManager.fallback_depth} > {DataManager.max_number_of_backups}), "
                f"fallback depth will be limited to number of backups")
            DataManager.fallback_depth = DataManager.max_number_of_backups

    @staticmethod
    def run_backup_process():
        """Runs the datamangement process for creating backups"""
        DataManager.logger.debug("run_backup_process()")

        DataManager.init()
        DataManager.backup_current_data()
        DataManager.remove_old_backups()

    @staticmethod
    def run_compose_upload_process():
        """Runs the datamangement process for composing the upload"""
        DataManager.logger.debug("run_compose_upload_process()")

        DataManager.init()
        DataManager.compose_upload()
Example no. 7
import hashlib
import json
import os

from elasticsearch import Elasticsearch

from shared.utils import read_data_from_json
from shared.LoggerFactory import LoggerFactory

ROOT_DIR = os.environ['ROOT_DIR']
client = Elasticsearch([{'host': '127.0.0.1', 'port': 9200}])
logger = LoggerFactory.get_elastic_logger()


def run_elastic_upload():
    """Recreates the posts index in elasticsearch."""
    logger.debug("run_elastic_upload()")
    logger.info("Starting Index Process!")

    index = 'posts'
    if client.indices.exists(index=index):
        client.indices.delete(index=index, ignore=[400, 404])

    request_body = {
        'mappings': {
            'properties': {
               'geo_location': {'type': 'geo_point'},
            }}
    }

    client.indices.create(index=index, body=request_body)
    logger.info("Finished Indexing!")
Example no. 8
import os

ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
os.environ['ROOT_DIR'] = ROOT_DIR

from upload_to_elasticsearch.elastic import run_elastic_upload
from shared.LoggerFactory import LoggerFactory
from data_management.DataManager import DataManager

LoggerFactory.get_elastic_logger().info("running elastic upload")
# starts the process of selecting the files to upload to elastic search
DataManager.run_compose_upload_process()
# execute the upload to elastic search
run_elastic_upload()
Example no. 9
import csv
import os
import time

from geopy.geocoders import Nominatim

from shared.LoggerFactory import LoggerFactory


class LatLonEnhancer:
    """Class handling the enhancement of posts by adding geo data."""

    logger = LoggerFactory.get_enhancement_logger()
    dict_file = os.path.join(os.getenv('ROOT_DIR'), 'data_enhancement',
                             'enhancement_location', 'geocoder_lat_lon.csv')
    lat_lon_dict = {}

    def __init__(self):
        """Initializes the enhancer."""
        self.__setup()
        self.geo_locator = Nominatim(user_agent="einander-helfen.org")
        self.__load_local_storage()

    def __setup(self):
        """Checks if the local storage file exists and creates it if it is missing"""
        LatLonEnhancer.logger.debug("__setup()")

        if not os.path.exists(self.dict_file):
            LatLonEnhancer.logger.warning(
                f"Creating missing geocoder_lat_lon.csv as {self.dict_file}")
            open(self.dict_file, "x", encoding='utf-8').close()

    def enhance(self, post):
        """Adds latitude and longitude to a given post, if both are missing. Returns the enhanced post."""
        LatLonEnhancer.logger.debug("enhance()")

        # Only look up coordinates if the post does not have a geo location yet
        if post['geo_location'] is None:

            request_string = LatLonEnhancer.get_api_request_string(post)

            lat_lon = self.__check_local_storage(request_string)

            if lat_lon is None:
                LatLonEnhancer.logger.info(f"enhancing lat lon for {post}")
                lat_lon = self.__handle_api_requests(request_string)
                if lat_lon:
                    self.__add_new_entry(request_string, lat_lon)

            post['geo_location'] = lat_lon
            post['post_struct']['geo_location'] = lat_lon

    def __check_local_storage(self, request_string):
        """Checks if local storage contains a result for the query. If it does, the geo_location object is returned.
           Returns None if local storage doesn't contain a result for the request"""
        LatLonEnhancer.logger.debug("__check_local_storage()")

        if request_string in self.lat_lon_dict:
            return self.lat_lon_dict[request_string]
        return None

    def __load_local_storage(self):
        """Reads local storage file (.csv) into class attribute"""
        LatLonEnhancer.logger.debug("__load_local_storage()")

        # Write the CSV header if the file is empty
        with open(self.dict_file, 'a', newline='',
                  encoding='utf-8') as csvfile:
            if not csvfile.tell():
                fieldnames = ['request', 'lat', 'lon']
                writer = csv.DictWriter(csvfile, fieldnames)
                writer.writeheader()

        # Read the file
        with open(self.dict_file, newline='', encoding='utf-8') as csvfile:
            geocoder_lat_lon = csv.reader(csvfile, delimiter=',')
            for row in geocoder_lat_lon:
                if row and row[0] != 'request':
                    # row[0]: request string, row[1]: lat, row[2]: lon
                    self.lat_lon_dict[row[0]] = {
                        'lat': float(row[1]),
                        'lon': float(row[2])
                    }
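        # Example of a cached entry as parsed above (illustrative values):
        #   'Musterstraße 1 60311 Frankfurt am Main Germany' -> {'lat': 50.11, 'lon': 8.68}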

    def __add_new_entry(self, request_string, geo_location):
        """Adds new entry to local storage"""
        LatLonEnhancer.logger.debug("__add_new_entry()")

        self.lat_lon_dict[request_string] = geo_location
        with open(self.dict_file, 'a', newline='',
                  encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([
                request_string,
                str(geo_location['lat']),
                str(geo_location['lon'])
            ])

        LatLonEnhancer.logger.info(
            f'Added geo location of \'{request_string}\' to the dictionary')

    def __handle_api_requests(self, request_string):
        """Executes the API request"""
        LatLonEnhancer.logger.debug(f"__handle_api_requests({request_string})")

        if request_string != "":
            location = self.geo_locator.geocode(request_string)
            time.sleep(1)

            if location:
                geo_location = {
                    'lat': location.latitude,
                    'lon': location.longitude
                }
                return geo_location
        return None

    @staticmethod
    def get_api_request_string(post):
        """Build the API request string"""
        LatLonEnhancer.logger.debug("get_api_request_string()")

        struct_data = post['post_struct']
        request_string = ""

        # Try to build the request string from:
        # 1. structured location, 2. structured address of contact, 3. structured address of organisation
        for field in ['location', 'contact', 'organization']:
            if not request_string and struct_data.get(field):
                address = struct_data[field]
                for key in ['street', 'zipcode', 'city', 'country']:
                    if address.get(key):
                        request_string += address[key] + ' '
                request_string = request_string.strip()

        return request_string
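
A hedged usage sketch of how the enhancer above might be driven over a batch of posts; the surrounding loop is an assumption for illustration, not taken from the examples:

# 'posts' is assumed to be a list of post dicts as produced by the scraping step
enhancer = LatLonEnhancer()
for post in posts:
    enhancer.enhance(post)  # fills post['geo_location'] and post['post_struct']['geo_location'] if missing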