Example #1
import codecs
import os
from subprocess import call

from PIL import Image
import av

from ocd_backend.exceptions import UnsupportedContentType
from ocd_backend.log import get_source_logger
from ocd_backend.settings import TEMP_DIR_PATH

log = get_source_logger('enricher_task')


class MediaEnrichmentException(Exception):
    pass


class BaseMediaEnrichmentTask(object):
    """The base class that media enrichment tasks should inherit."""

    #: The content types that the task is able to process
    content_types = []

    def __init__(self, media_item, content_type, file_object, enrichment_data,
                 object_id, combined_index_doc, doc):
        if self.content_types != '*' and \
                content_type.lower() not in self.content_types:
            raise UnsupportedContentType()

        return self.enrich_item(media_item, content_type, file_object,
Example #2
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('persons')


@celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True)
def allmanak_person_item(self, content_type, raw_item, entity, source_item, **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']
    
    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'allmanak',
        'collection': 'person',
    }

    person = Person(original_item['systemid'], **source_defaults)
    person.canonical_id = original_item['systemid']
    person.has_organization_name = TopLevelOrganization(self.source_definition['allmanak_id'],
                                                        source=self.source_definition['key'],
                                                        supplier='allmanak',
                                                        collection=self.source_definition['source_type'])

    person.name = original_item['naam']
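    # Infer gender from the Dutch salutation prefixed to the name
    # ('Dhr.' = Mr., 'Mw.' = Ms.)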
    if 'Dhr.' in original_item['naam']:
        person.gender = 'Man'
    elif 'Mw.' in original_item['naam']:
        person.gender = 'Vrouw'
Example #3
from ocd_backend import celery_app
from ocd_backend import settings
from ocd_backend.es import elasticsearch as es
from ocd_backend.log import get_source_logger

log = get_source_logger('ocd_backend.tasks')


class BaseCleanup(celery_app.Task):
    ignore_result = True

    def run(self, *args, **kwargs):
        run_identifier = kwargs.get('run_identifier')
        run_identifier_chains = '{}_chains'.format(run_identifier)
        self._remove_chain(run_identifier_chains, kwargs.get('chain_id'))

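        # Once no chains remain in the set and the extractor has marked the
        # run as 'done', the run can be finalized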
        if self.backend.get_set_cardinality(
                run_identifier_chains) < 1 and self.backend.get(
                    run_identifier) == 'done':
            self.backend.remove(run_identifier_chains)
            self.run_finished(**kwargs)
        else:
            # If the extractor is still running, extend the lifetime of the
            # identifier
            self.backend.update_ttl(
                run_identifier,
                settings.CELERY_CONFIG.get('CELERY_TASK_RESULT_EXPIRES', 1800))

    def _remove_chain(self, run_identifier, value):
        self.backend.remove_value_from_set(run_identifier, value)
Example #4
from collections import namedtuple
from copy import deepcopy

from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.transformers import BaseTransformer
from ocd_backend.utils.misc import load_object
from ocd_backend.models.model import PostgresDatabase
from ocd_backend.models.serializers import PostgresSerializer

log = get_source_logger('database_transformer')


RelationPlaceholder = namedtuple('RelationPlaceholder', 'ori_id')


class DatabaseTransformer(BaseTransformer):
    """
    Base class for specific database transformers to inherit from.
    """

    def __init__(self, *args, **kwargs):
        super(DatabaseTransformer, self).__init__(*args, **kwargs)
        self.database = PostgresDatabase(serializer=PostgresSerializer)
        self.created_models = dict()
        self.processed_subresources = set()

    @staticmethod
    def get_model_class(properties):
        """
Example #5
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('notubiz_committee')


@celery_app.task(bind=True,
                 base=BaseTransformer,
                 autoretry_for=(Exception, ),
                 retry_backoff=True)
def committee_item(self, content_type, raw_item, entity, source_item,
                   **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'notubiz',
        'collection': 'committee',
    }

    committee = Organization(original_item['id'], **source_defaults)
    committee.canonical_iri = entity
    committee.has_organization_name = TopLevelOrganization(
        self.source_definition['allmanak_id'],
        source=self.source_definition['key'],
        supplier='allmanak',
        collection=self.source_definition['source_type'])
Example #6
import re

import iso8601

from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('ibabs_meeting')


@celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True)
def meeting_item(self, content_type, raw_item, entity, source_item, **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'ibabs',
        'collection': 'meeting',
    }

    # Sometimes the meeting is contained in a sub-dictionary called 'Meeting'
    if 'Meeting' in original_item:
        meeting = original_item['Meeting']
    else:
        meeting = original_item

    item = Meeting(meeting['Id'], **source_defaults)
    item.canonical_id = entity
Example #7
from datetime import datetime
from ocd_backend.log import get_source_logger
from ocd_backend.items import BaseItem

log = get_source_logger('loader')


class ArchiefAlkmaarBaseItem(BaseItem):
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'dc': 'http://purl.org/dc/elements/1.1/',
        'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'europeana': 'http://www.europeana.eu/schemas/ese/',
        'dcterms': 'http://purl.org/dc/terms/'
    }

    media_mime_types = {
        'png': 'image/png',
        'jpg': 'image/jpg',
    }

    def _get_text_or_none(self, xpath_expression):
        node = self.original_item.find(
            xpath_expression, namespaces=self.namespaces)
        if node is not None and node.text is not None:
            return self.cleanup_xml_inner(node)

        return None

    def get_original_object_id(self):
Example #8
from urlparse import urljoin

from lxml import etree

from ocd_backend.items import BaseItem
from ocd_backend.models import *
from ocd_backend.models.model import Relationship
from ocd_backend.utils.http import HttpRequestMixin
from ocd_backend.log import get_source_logger

log = get_source_logger('persons')

class AlmanakPersonItem(HttpRequestMixin, BaseItem):
    def get_rights(self):
        return u'undefined'

    def get_collection(self):
        return unicode(self.source_definition['index_name'])

    def get_object_model(self):
        source_defaults = {
            'source': 'almanak',
            'source_id_key': 'identifier',
            'organization': self.source_definition['key'],
        }

        request_url = u'https://almanak.overheid.nl%s' % (
            unicode(self.original_item['url']),)

        r = self.http_session.get(request_url, verify=False)
        r.raise_for_status()
Example #9
from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.utils.misc import deep_get, compare_insensitive

log = get_source_logger('ggm_meeting')


@celery_app.task(bind=True,
                 base=BaseTransformer,
                 autoretry_for=settings.AUTORETRY_EXCEPTIONS,
                 retry_backoff=True)
def meeting_item(self, content_type, raw_item, canonical_iri, cached_path,
                 **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': 'tweedekamer',
        'supplier': 'gegevensmagazijn',
        'canonical_iri': canonical_iri,
        'cached_path': cached_path,
    }

    meeting = Meeting(original_item['Id'],
                      collection='meeting',
                      **source_defaults)
    meeting.name = original_item.get('Onderwerp')
    meeting.start_date = original_item.get('Aanvangstijd')
Example #10
import simplejson as json

from ocd_backend.app import celery_app
from ocd_backend.extractors import BaseExtractor
from ocd_backend.log import get_source_logger
from ocd_backend.utils.http import GCSCachingMixin

log = get_source_logger('ggm')
ggm_base_url = 'https://gegevensmagazijn.tweedekamer.nl/'


class GGMBaseExtractor(BaseExtractor, GCSCachingMixin):
    bucket_name = 'ggm'
    request_url = None

    def __init__(self, source_definition):
        super(GGMBaseExtractor, self).__init__(source_definition=source_definition)

    def run(self):
        assert self.request_url

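        # Resume paging where a previous run left off: the last $skip offset
        # for this feed is kept in the Celery result backend under the
        # request URL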
        skip = celery_app.backend.get(self.request_url)
        if not skip:
            skip = 0

        while True:
            full_url = '{}{}&$skip={}'.format(ggm_base_url, self.request_url, skip)
            _, _, odata_substring = full_url.rpartition(ggm_base_url)

            if self.exists(odata_substring):
                resource = self.download_cache(odata_substring)
Example #11
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('organizations')


def transform_contact_details(data):
    """
    Takes a dictionary of contact details and flattens every nested entry to
    {'<key>_<subkey>': {'label': <subkey>, 'value': <value>}}; entries that
    already contain a 'label' key are kept as-is.
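
    An illustrative (assumed) input, not taken from real source data:

        {'email': {'label': 'E-mail', 'value': 'info@example.nl'},
         'phone': {'mobiel': '0612345678'}}

    would be flattened to:

        {'email': {'label': 'E-mail', 'value': 'info@example.nl'},
         'phone_mobiel': {'label': 'mobiel', 'value': '0612345678'}}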
    """

    transformed_data = {}
    for key, value in data.items():
        if 'label' in value:
            transformed_data[key] = value
        else:
            for key2, value2 in value.items():
                transformed_data['%s_%s' % (key, key2)] = {'label': key2, 'value': value2}

    return transformed_data


@celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True)
def municipality_organization_item(self, content_type, raw_item, entity, source_item, **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
Example #12
from datetime import datetime

from ocd_backend.log import get_source_logger
from ocd_backend.items import BaseItem

log = get_source_logger('item')


class A2AItem(BaseItem):
    namespaces = {
        'oai': 'http://www.openarchives.org/OAI/2.0/',
        'xml': 'http://www.w3.org/XML/1998/namespace',
        'a2a': 'http://Mindbus.nl/A2A'
    }

    def _get_node_or_none(self, xpath_expression, node=None):
        """
        Returns the requested node based on the xpath expression. Returns None
        if the node does not exist.
        """
        if node is None:
            node = self.original_item
        return node.find(xpath_expression, namespaces=self.namespaces)

    def _get_text_or_none(self, xpath_expression, start_node=None):
        """
        Returns the text node(s) in the node requested based on the xpath
        expression. Returns None if no text nodes could be found. Optionally
        you can specify a start_node for the expression.
        """
        if start_node is None:
Example #13
from tempfile import NamedTemporaryFile

import requests
import urllib3
from google.cloud import storage, exceptions
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from ocd_backend.exceptions import InvalidFile, ItemAlreadyProcessed
from ocd_backend.log import get_source_logger
from ocd_backend.settings import TEMP_DIR_PATH
from ocd_backend.settings import USER_AGENT, DATA_DIR_PATH
from ocd_backend.utils.misc import localize_datetime, datetime_to_unixstamp, \
    str_to_datetime

log = get_source_logger('http')


class CustomRetry(Retry):
    """A subclass of the Retry class but with extra logging"""

    def increment(self, method=None, url=None, response=None, error=None,
                  _pool=None, _stacktrace=None):
        res = super(CustomRetry, self).increment(method, url, response,
                                                 error, _pool, _stacktrace)
        log.info("Retrying url: %s" % url)
        return res


class HttpRequestMixin(object):
    """A mixin that can be used by extractors that use HTTP as a method
Example #14
import re

from ocd_backend import celery_app
from ocd_backend.models.database import Neo4jDatabase
from ocd_backend.models.definitions import Mapping, Prov, Ori
from ocd_backend.models.exceptions import MissingProperty, ValidationError, \
    QueryResultError
from ocd_backend.models.properties import PropertyBase, Property, \
    StringProperty, IntegerProperty, Relation
from ocd_backend.models.serializers import Neo4jSerializer
from ocd_backend.models.misc import Namespace, Uri
from ocd_backend.utils.misc import iterate, doc_type
from ocd_backend.log import get_source_logger
from ocd_backend.utils.misc import slugify

logger = get_source_logger('model')


class ModelMetaclass(type):
    database_class = Neo4jDatabase
    serializer_class = Neo4jSerializer

    def __new__(mcs, name, bases, attrs):
        # Collect fields from current class.
        definitions = dict()
        for key, value in list(attrs.items()):
            if isinstance(value, PropertyBase):
                definitions[key] = value
                attrs.pop(key)

        if len(bases) > 1 and not issubclass(bases[0], Namespace):
Example #15
import os

from ocd_backend.enrichers.media_enricher import MediaEnricher
from ocd_backend.log import get_source_logger
from ocd_backend.settings import DATA_DIR_PATH
from ocd_backend.utils.misc import get_sha1_hash
from ocd_backend.utils.http import LocalCachingMixin, GCSCachingMixin

log = get_source_logger('enricher')


class LocalStaticMediaEnricher(MediaEnricher, LocalCachingMixin):
    pass


class GCSStaticMediaEnricher(MediaEnricher, GCSCachingMixin):
    bucket_name = 'ori-static'
Example #16
import re

import iso8601

from ocd_backend.items import BaseItem
from ocd_backend.log import get_source_logger
from ocd_backend.models import *

log = get_source_logger('item')


class IBabsMeetingItem(BaseItem):
    def get_rights(self):
        return u'undefined'

    def get_collection(self):
        return unicode(self.source_definition['index_name'])

    def get_object_model(self):
        source_defaults = {
            'source': 'ibabs',
            'source_id_key': 'identifier',
            'organization': self.source_definition['index_name'],
        }

        meeting = self.original_item
        if 'MeetingId' not in self.original_item:
            item = Meeting(self.original_item['Id'], **source_defaults)
            item.name = meeting['Meetingtype']
            item.chair = meeting['Chairman']
            item.location = meeting['Location'].strip()
Example #17
import urllib

import requests

from ocd_backend.app import celery_app
from ocd_backend.enrichers import BaseEnricher
from ocd_backend.exceptions import SkipEnrichment
from ocd_backend.log import get_source_logger
from ocd_backend.settings import RESOLVER_BASE_URL, AUTORETRY_EXCEPTIONS
from ocd_backend.utils.http import GCSCachingMixin
from ocd_backend.utils.misc import strip_scheme
from tasks.image_metadata import ImageMetadata

log = get_source_logger('enricher')


class MediaEnricher(BaseEnricher, GCSCachingMixin):
    """An enricher that is responsible for enriching external media
    (images, audio, video, etc.)

    Media items are fetched from the source and then passed on to a
    set of registered tasks that are responsible for the analysis.
    """

    bucket_name = 'ori-static'

    #: The registry of available sub-tasks that are responsible for the
    #: analysis of media items.
    available_tasks = {
        'image_metadata': ImageMetadata,
    }
Example #18
import iso8601

from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('goapi_meeting')


class GOAPITransformer(BaseTransformer):
    def get_current_permalink(self, original_item):
        api_version = self.source_definition.get('api_version', 'v1')
        base_url = '%s/%s' % (
            self.source_definition['base_url'], api_version,)

        return u'%s/meetings/%i' % (base_url, original_item[u'id'],)

    def get_documents_as_media_urls(self, original_item):
        current_permalink = self.get_current_permalink(original_item)

        output = []
        for document in original_item.get('documents', []):
            # sleep(1)
            url = u"%s/documents/%s" % (current_permalink, document['id'])
            output.append({
                'url': url,
                'note': document[u'filename']})
        return output

Example #19
from datetime import datetime

from ocd_backend import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.mixins import OCDBackendTaskSuccessMixin, OCDBackendTaskFailureMixin
from ocd_backend.utils.misc import iterate

log = get_source_logger('loader')


class BaseLoader(OCDBackendTaskSuccessMixin, OCDBackendTaskFailureMixin,
                 celery_app.Task):
    """The base class that other loaders should inherit."""
    def start(self, *args, **kwargs):
        """Start loading of a single item.

        This method is called by the transformer and expects args to
        contain the output of the transformer as a tuple.
        Kwargs should contain the ``source_definition`` dict.

        :returns: the output of :py:meth:`~BaseTransformer.transform_item`
        """
        self.source_definition = kwargs['source_definition']

        for _, item in iterate(args):
            self.post_processing(item)
            self.load_item(item)

    def load_item(self, doc):
        raise NotImplementedError
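
# A minimal sketch of a concrete loader under the contract described above;
# the class name and behaviour are illustrative and not part of this codebase:
#
# class LoggingLoader(BaseLoader):
#     def load_item(self, doc):
#         log.info('Loaded item: %s', doc)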
Example #20
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('notubiz_meeting')


@celery_app.task(bind=True,
                 base=BaseTransformer,
                 autoretry_for=(Exception, ),
                 retry_backoff=True)
def meeting_item(self, content_type, raw_item, entity, source_item, **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'notubiz',
        'collection': 'meeting',
    }

    event = Meeting(original_item['id'], **source_defaults)
    event.canonical_iri = entity
    event.has_organization_name = TopLevelOrganization(
        self.source_definition['allmanak_id'],
        source=self.source_definition['key'],
        supplier='allmanak',
        collection=self.source_definition['source_type'])
    event.start_date = original_item['plannings'][0]['start_date']
    event.end_date = original_item['plannings'][0]['end_date']
Example #21
from ocd_backend import celery_app
from ocd_backend import settings
from ocd_backend.es import elasticsearch as es
from ocd_backend.log import get_source_logger


log = get_source_logger('ocd_backend.tasks')


class BaseCleanup(celery_app.Task):
    ignore_result = True

    def run(self, *args, **kwargs):
        run_identifier = kwargs.get('run_identifier')
        run_identifier_chains = '{}_chains'.format(run_identifier)
        self._remove_chain(run_identifier_chains, kwargs.get('chain_id'))

        if self.backend.get_set_cardinality(run_identifier_chains) < 1 and self.backend.get(run_identifier) == 'done':
            self.backend.remove(run_identifier_chains)
            self.run_finished(**kwargs)
        else:
            # If the extractor is still running, extend the lifetime of the
            # identifier
            self.backend.update_ttl(run_identifier, settings.CELERY_CONFIG
                                    .get('CELERY_TASK_RESULT_EXPIRES', 1800))

    def _remove_chain(self, run_identifier, value):
        self.backend.remove_value_from_set(run_identifier, value)

    def run_finished(self, run_identifier, **kwargs):
Example #22
import re

import iso8601

from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('ibabs_report')


@celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True)
def report_item(self, content_type, raw_item, entity, source_item, **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'ibabs',
        'collection': 'report',
    }

    report = CreativeWork(original_item['id'][0],
                          source=self.source_definition['key'],
                          supplier='ibabs',
                          collection='report')
    report.canonical_id = original_item['id'][0]
    report.has_organization_name = TopLevelOrganization(self.source_definition['allmanak_id'],
                                                        source=self.source_definition['key'],
                                                        supplier='allmanak',
Example #23
import operator
import requests

from ocd_backend.enrichers.text_enricher.tasks import BaseEnrichmentTask
from ocd_backend.models.definitions import Meeting as MeetingNS, Rdf
from ocd_backend.models.misc import Uri
from ocd_backend.settings import ORI_CLASSIFIER_HOST, ORI_CLASSIFIER_PORT
from ocd_backend.utils.http import HttpRequestMixin
from ocd_backend.log import get_source_logger

log = get_source_logger('theme_classifier')


class ThemeClassifier(BaseEnrichmentTask, HttpRequestMixin):
    def enrich_item(self, item):
        if not ORI_CLASSIFIER_HOST or not ORI_CLASSIFIER_PORT:
            # Skip classifier if no host is specified
            return

        ori_classifier_url = 'http://{}:{}/classificeer'.format(ORI_CLASSIFIER_HOST, ORI_CLASSIFIER_PORT)

        if not hasattr(item, 'text'):
            return

        text = item.text
        if isinstance(item.text, list):
            text = ' '.join(text)

        if not text or len(text) < 76:
            return
Example #24
from lxml import etree

from ocd_backend.log import get_source_logger
from ocd_backend.transformers import BaseTransformer
from ocd_backend.utils.misc import load_object, strip_namespaces

log = get_source_logger('transformer')


class GegevensmagazijnTransformer(BaseTransformer):
    def run(self, *args, **kwargs):
        args = args[0]

        self.source_definition = kwargs['source_definition']
        item = self.deserialize_item(*args)

        return self.transform_item(*args, item=strip_namespaces(item))

    def transform_item(self,
                       raw_item_content_type,
                       raw_item,
                       item,
                       class_name=False):

        if not class_name:
            class_name = item.xpath("local-name()")

        if class_name in self.source_definition['mapping']:
            item_source = self.source_definition['mapping'][class_name]
            item_class = item_source['item']
        else:
Example #25
from datetime import datetime
from hashlib import sha1

from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.models import *
from ocd_backend.transformers import BaseTransformer

log = get_source_logger('greenvalley')


class GreenValleyTransformer(BaseTransformer):
    def __init__(self, *args, **kwargs):
        self.classification_mapping = {
            'agenda': 'Agenda',
            'agendapage': 'Agendapunt',
            'bestuurlijkstuk': 'Bestuurlijk stuk',
            'notule': 'Verslag',
            'ingekomenstuk': 'Ingekomen stuk',
            'antwoordstuk': 'Antwoord'  # ?
        }

    def get_meeting_dates(self, meeting):
        """Determine meeting start and end dates."""

        start_date = None
        end_date = None

        if meeting.get(u'bis_vergaderdatum', u'').strip() != u'':
            start_date = datetime.fromtimestamp(
Example #26
from copy import deepcopy
from datetime import datetime
from uuid import uuid4

from celery import chain, group
from elasticsearch.exceptions import NotFoundError

from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.es import elasticsearch as es
from ocd_backend.exceptions import ConfigurationError
from ocd_backend.log import get_source_logger
from ocd_backend.utils.misc import load_object, propagate_chain_get

logger = get_source_logger('pipeline')


@celery_app.task(autoretry_for=settings.AUTORETRY_EXCEPTIONS, retry_backoff=True)
def setup_pipeline(source_definition):
    logger.debug('[%s] Starting pipeline for source: %s' % (source_definition['key'], source_definition.get('id')))

    # The index name from the source definition is used as an alias that
    # points to the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=source_definition.get('es_prefix', settings.DEFAULT_INDEX_PREFIX),
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
Example #27
import json

from ocd_backend import celery_app
from ocd_backend import settings
from ocd_backend.es import elasticsearch
from ocd_backend.exceptions import ConfigurationError
from ocd_backend.loaders import BaseLoader
from ocd_backend.log import get_source_logger
from ocd_backend.models.serializers import JsonLDSerializer
from ocd_backend.utils import json_encoder
from ocd_backend.utils.misc import get_sha1_hash

log = get_source_logger('elasticsearch_loader')


class ElasticsearchLoader(BaseLoader):
    """Indexes items into Elasticsearch.

    Each URL found in ``media_urls`` is added as a document to the
    ``RESOLVER_URL_INDEX`` (if it doesn't already exist).
    """
    def start(self, *args, **kwargs):
        self.index_name = kwargs.get('new_index_name')

        if not self.index_name:
            raise ConfigurationError('The name of the index is not provided')

        return super(ElasticsearchLoader, self).start(*args, **kwargs)

    def load_item(self, doc):
        # Recursively index associated models like attachments
Example #28
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('goapi_committee')


@celery_app.task(bind=True,
                 base=BaseTransformer,
                 autoretry_for=(Exception, ),
                 retry_backoff=True)
def committee_item(self, content_type, raw_item, entity, source_item,
                   **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'gemeenteoplossingen',
        'collection': 'committee',
    }

    committee = Organization(original_item['id'], **source_defaults)
    committee.canonical_id = original_item['id']
    committee.has_organization_name = TopLevelOrganization(
        self.source_definition['allmanak_id'],
        source=self.source_definition['key'],
        supplier='allmanak',
        collection=self.source_definition['source_type'])
Example #29
from confluent_kafka import Producer
from pyld import jsonld

from ocd_backend import celery_app
from ocd_backend import settings
from ocd_backend.loaders import BaseLoader
from ocd_backend.log import get_source_logger
from ocd_backend.models.serializers import JsonLDSerializer

log = get_source_logger('delta_loader')


class DeltaLoader(BaseLoader):
    """Serializes a model to N-Quads and then sends it to a Kafka bus."""

    config = {
        'bootstrap.servers': settings.KAFKA_HOST,
        'session.timeout.ms': settings.KAFKA_SESSION_TIMEOUT,
    }

    if settings.KAFKA_USERNAME:
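        # Authenticate against the Kafka broker using SASL/PLAIN over TLS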
        config['sasl.mechanisms'] = 'PLAIN'
        config['security.protocol'] = 'SASL_SSL'
        # config['ssl.ca.location'] = '/usr/local/etc/openssl/cert.pem'
        config['sasl.username'] = settings.KAFKA_USERNAME
        config['sasl.password'] = settings.KAFKA_PASSWORD

    def load_item(self, doc):

        # Skip this loader if it is disabled in settings
        if not settings.KAFKA_ENABLED:
Example #30
from datetime import datetime
from uuid import uuid4

from elasticsearch.exceptions import NotFoundError
from celery import chain

from ocd_backend.es import elasticsearch as es
from ocd_backend import settings, celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.utils.misc import load_object
from ocd_backend.exceptions import ConfigurationError

logger = get_source_logger('pipeline')


def setup_pipeline(source_definition):
    # The index name from the source definition is used as an alias that
    # points to the current version of the index
    index_alias = '{prefix}_{index_name}'.format(
        prefix=settings.DEFAULT_INDEX_PREFIX,
        index_name=source_definition.get('index_name',
                                         source_definition.get('id'))
    )

    if not es.indices.exists(index_alias):
        index_name = '{index_alias}_{now}'.format(index_alias=index_alias,
                                                  now=datetime.utcnow()
                                                  .strftime('%Y%m%d%H%M%S'))

        es.indices.create(index_name)
        es.indices.put_alias(name=index_alias, index=index_name)
Example #31
from datetime import datetime

import requests

from ocd_backend.enrichers.text_enricher.tasks import BaseEnrichmentTask
from ocd_backend.log import get_source_logger
from ocd_backend.models.definitions import Geo, NeoGeo
from ocd_backend.models.definitions import schema
from ocd_backend.models.misc import Uri
from ocd_backend.settings import LOCLINKVIS_HOST, LOCLINKVIS_PORT
from ocd_backend.utils.http import HttpRequestMixin

log = get_source_logger('waaroverheid')


class WaarOverheidEnricher(BaseEnrichmentTask, HttpRequestMixin):
    """WaarOverheid Enricher searches for location data in text sources and
    returns which districts, neighborhoods and annotations were mentioned."""
    loclinkvis_url = None

    def enrich_item(self, item):
        if not isinstance(item, schema.MediaObject):
            return

        if not LOCLINKVIS_HOST or not LOCLINKVIS_PORT:
            # Skip waaroverheid if no host is specified
            return

        self.loclinkvis_url = 'http://{}:{}'.format(LOCLINKVIS_HOST, LOCLINKVIS_PORT)

        cbs_id = self.source_definition.get('cbs_id')
Example #32
# -*- coding: utf-8 -*-

from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound

from ocd_backend.models.definitions import Mapping, Prov, Ori, Meta
from ocd_backend.models.exceptions import MissingProperty
from ocd_backend.models.properties import PropertyBase, Property, StringProperty, Relation
from ocd_backend.models.serializers import PostgresSerializer
from ocd_backend.models.misc import Namespace, Uri
from ocd_backend.utils.misc import iterate
from ocd_backend.log import get_source_logger
from ocd_backend.utils.misc import slugify
from ocd_backend.models.postgres_database import PostgresDatabase

logger = get_source_logger('model')


class ModelMetaclass(type):
    database_class = PostgresDatabase
    serializer_class = PostgresSerializer

    def __new__(mcs, name, bases, attrs):
        # Collect fields from current class.
        definitions = dict()
        for key, value in list(attrs.items()):
            if isinstance(value, PropertyBase):
                definitions[key] = value
                attrs.pop(key)

        if len(bases) > 1 and not issubclass(bases[0], Namespace):
            raise ValueError('First argument of a Model subclass'
Example #33
from datetime import datetime
from hashlib import sha1
from pprint import pprint

import iso8601

from ocd_backend.items import BaseItem
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('goapi_meeting')


class GemeenteOplossingenMeeting(BaseItem):
    def _get_current_permalink(self):
        api_version = self.source_definition.get('api_version', 'v1')
        base_url = '%s/%s' % (
            self.source_definition['base_url'], api_version,)

        return u'%s/meetings/%i' % (base_url, self.original_item[u'id'],)

    def get_rights(self):
        return u'undefined'

    def get_collection(self):
        return unicode(self.source_definition['index_name'])

    def _get_documents_as_media_urls(self, documents):
        current_permalink = self._get_current_permalink()

        output = []
Example #34
import requests
import urllib3
from google.auth.exceptions import GoogleAuthError
from google.cloud import storage, exceptions
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

from ocd_backend.exceptions import InvalidFile, ItemAlreadyProcessed
from ocd_backend.log import get_source_logger
from ocd_backend.settings import TEMP_DIR_PATH
from ocd_backend.settings import USER_AGENT, DATA_DIR_PATH
from ocd_backend.utils.misc import localize_datetime, datetime_to_unixstamp, \
    str_to_datetime

log = get_source_logger('http')


class CustomRetry(Retry):
    """A subclass of the Retry class but with extra logging"""
    def increment(self,
                  method=None,
                  url=None,
                  response=None,
                  error=None,
                  _pool=None,
                  _stacktrace=None):
        res = super(CustomRetry, self).increment(method, url, response, error,
                                                 _pool, _stacktrace)
        log.info("Retrying url: %s" % url)
        return res
Example #35
from datetime import datetime
from urlparse import urljoin

import requests

from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.models import *
from ocd_backend.transformers import BaseTransformer
from ocd_backend.utils.misc import strip_scheme

log = get_source_logger('gedeputeerdestaten')


class GedeputeerdeStatenTransformer(BaseTransformer):
    def __init__(self, *args, **kwargs):
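        # Dutch month names as they appear in the source dates, mapped to
        # zero-padded month numbers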
        self.date_mapping = {
            'januari': '01',
            'februari': '02',
            'maart': '03',
            'april': '04',
            'mei': '05',
            'juni': '06',
            'juli': '07',
            'augustus': '08',
            'september': '09',
            'oktober': '10',
            'november': '11',
            'december': '12',
        }
Example #36
import tempfile
from urllib2 import HTTPError

import magic
import pdfparser.poppler as pdf
import tika.parser as parser

from ocd_backend.log import get_source_logger

log = get_source_logger('file_parser')


def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            result_pages = []
            i = 0
            d = pdf.Document(fname, quiet=True)
            for i, p in enumerate(d, start=1):
                text_array = []
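                # Walk pdfparser's page > flow > block > line hierarchy and
                # collect the text of every line on the page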
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(unicode(l.text))
                result_pages.append('\n'.join(text_array))

                if pages is not None and i >= pages:  # break after the requested number of pages
                    break

            log.debug("Processed %i pages (%i max)", i, pages)
            return result_pages
Example #37
import re

from suds.client import Client
from suds.transport.https import HttpTransport

from ocd_backend import settings
from ocd_backend.extractors import BaseExtractor
from ocd_backend.log import get_source_logger
from ocd_backend.utils.api import FrontendAPIMixin
from ocd_backend.utils.http import HttpRequestMixin
from ocd_backend.utils.ibabs import (meeting_to_dict, meeting_type_to_dict,
                                     list_report_response_to_dict,
                                     list_entry_response_to_dict,
                                     votes_to_dict, person_profile_to_dict)

log = get_source_logger('extractor')


class IBabsBaseExtractor(BaseExtractor):
    """
    A base extractor for the iBabs SOAP service. Instantiates the client
    and configures the right port to use.
    """
    def run(self):
        pass

    def __init__(self, *args, **kwargs):
        super(IBabsBaseExtractor, self).__init__(*args, **kwargs)

        try:
            ibabs_wsdl = self.source_definition['wsdl']
Example #38
from lxml import etree

from ocd_backend.log import get_source_logger
from ocd_backend.transformers import BaseTransformer
from ocd_backend.utils.misc import load_object, strip_namespaces

log = get_source_logger('transformer')


class GegevensmagazijnTransformer(BaseTransformer):
    def run(self, *args, **kwargs):
        args = args[0]

        self.source_definition = kwargs['source_definition']
        item = self.deserialize_item(*args)

        return self.transform_item(*args, item=strip_namespaces(item))

    def transform_item(self, raw_item_content_type, raw_item, item,
                       class_name=False):

        if not class_name:
            class_name = item.xpath("local-name()")

        if class_name in self.source_definition['mapping']:
            item_source = self.source_definition['mapping'][class_name]
            item_class = item_source['item']
        else:
            log.info('Skipping %s, does not exist in mapping' % class_name)
            return []
Example #39
import json
from time import sleep

from lxml import etree

from ocd_backend.extractors import BaseExtractor
from ocd_backend.log import get_source_logger
from ocd_backend.utils.http import HttpRequestMixin

log = get_source_logger('extractor')


class GemeenteOplossingenBaseExtractor(BaseExtractor, HttpRequestMixin):
    """
    A base extractor for scraping GemeenteOplossingen websites. This
    base extractor just configures the base url to use for scraping.
    """

    def run(self):
        pass

    def __init__(self, *args, **kwargs):
        super(GemeenteOplossingenBaseExtractor, self).__init__(*args, **kwargs)

        self.base_url = self.source_definition['base_url']

    def _get_committees(self):
        """
        Gets a list of committees, along with links to upcoming and archived
        meetings.
        """
Example #40
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('ibabs_committee')


@celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True)
def committee_item(self, content_type, raw_item, entity, source_item, **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'ibabs',
        'collection': 'committee',
    }

    committee = Organization(original_item['Id'], **source_defaults)
    committee.canonical_id = entity
    committee.has_organization_name = TopLevelOrganization(self.source_definition['allmanak_id'],
                                                           source=self.source_definition['key'],
                                                           supplier='allmanak',
                                                           collection=self.source_definition['source_type'])

    committee.name = original_item['Meetingtype']
    committee.description = original_item['Abbreviation']

    if 'sub' in original_item['Meetingtype']:
        committee.classification = u'Subcommittee'
Example #41
from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.models import *
from ocd_backend.transformers import BaseTransformer

log = get_source_logger('ibabs_person')


@celery_app.task(bind=True,
                 base=BaseTransformer,
                 autoretry_for=settings.AUTORETRY_EXCEPTIONS,
                 retry_backoff=True)
def person_item(self, content_type, raw_item, canonical_iri, cached_path,
                **kwargs):
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'ibabs',
        'collection': 'person',
        'cached_path': cached_path,
    }

    person = Person(original_item['UserId'], **source_defaults)
    person.has_organization_name = TopLevelOrganization(
        self.source_definition['allmanak_id'],
        source=self.source_definition['key'],
        supplier='allmanak',
        collection=self.source_definition['source_type'])
Example #42
import tempfile
from urllib2 import HTTPError

import magic
import pdfparser.poppler as pdf
import tika.parser as parser

from ocd_backend.log import get_source_logger

log = get_source_logger('file_parser')


def file_parser(fname, pages=None):
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            i = 0
            d = pdf.Document(fname)
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))

                if pages is not None and i >= pages:  # break after the requested number of pages
                    break

            log.debug("Processed %i pages (%i max)", i, pages)
            return '\n'.join(text_array)
        except:
            # reraise everything