import codecs import os from subprocess import call from PIL import Image import av from ocd_backend.exceptions import UnsupportedContentType from ocd_backend.log import get_source_logger from ocd_backend.settings import TEMP_DIR_PATH log = get_source_logger('enricher_task') class MediaEnrichmentException(Exception): pass class BaseMediaEnrichmentTask(object): """The base class that media enrichment tasks should inherit.""" #: The content types that the tasks is able to process content_types = [] def __init__(self, media_item, content_type, file_object, enrichment_data, object_id, combined_index_doc, doc): if self.content_types is not '*' and content_type.lower() not\ in self.content_types: raise UnsupportedContentType() return self.enrich_item(media_item, content_type, file_object,
from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('persons') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True) def allmanak_person_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'], 'supplier': 'allmanak', 'collection': 'person', } person = Person(original_item['systemid'], **source_defaults) person.canonical_id = original_item['systemid'] person.has_organization_name = TopLevelOrganization(self.source_definition['allmanak_id'], source=self.source_definition['key'], supplier='allmanak', collection=self.source_definition['source_type']) person.name = original_item['naam'] if 'Dhr.' in original_item['naam']: person.gender = 'Man' elif 'Mw.' in original_item['naam']: person.gender = 'Vrouw'
from ocd_backend import celery_app
from ocd_backend import settings
from ocd_backend.es import elasticsearch as es
from ocd_backend.log import get_source_logger

log = get_source_logger('ocd_backend.tasks')


class BaseCleanup(celery_app.Task):
    """Celery task that runs at the end of a pipeline chain and decides
    whether the whole run is finished.

    It removes this chain's id from the set of active chains kept in the
    Celery result backend; when no chains remain and the extractor has
    marked the run 'done', it triggers the finish hook, otherwise it
    extends the TTL of the run identifier so the bookkeeping keys do not
    expire mid-run.
    """

    # Cleanup produces no result worth storing in the backend.
    ignore_result = True

    def run(self, *args, **kwargs):
        run_identifier = kwargs.get('run_identifier')
        # The backend keeps a set named '<run_identifier>_chains' holding the
        # ids of all chains still in flight for this run.
        run_identifier_chains = '{}_chains'.format(run_identifier)
        self._remove_chain(run_identifier_chains, kwargs.get('chain_id'))

        # Finished only when this was the last chain AND the extractor has
        # flagged the run as 'done' in the backend.
        if self.backend.get_set_cardinality(
                run_identifier_chains) < 1 and self.backend.get(
                run_identifier) == 'done':
            self.backend.remove(run_identifier_chains)
            # run_finished is not defined in this base class's visible body —
            # presumably provided by a subclass or later in this module.
            self.run_finished(**kwargs)
        else:
            # If the extractor is still running, extend the lifetime of the
            # identifier
            self.backend.update_ttl(
                run_identifier,
                settings.CELERY_CONFIG.get('CELERY_TASK_RESULT_EXPIRES', 1800))

    def _remove_chain(self, run_identifier, value):
        # Drop a single chain id from the run's active-chain set.
        self.backend.remove_value_from_set(run_identifier, value)
from collections import namedtuple from copy import deepcopy from ocd_backend import settings from ocd_backend.app import celery_app from ocd_backend.log import get_source_logger from ocd_backend.transformers import BaseTransformer from ocd_backend.utils.misc import load_object from ocd_backend.models.model import PostgresDatabase from ocd_backend.models.serializers import PostgresSerializer log = get_source_logger('database_transformer') RelationPlaceholder = namedtuple('RelationPlaceholder', 'ori_id') class DatabaseTransformer(BaseTransformer): """ Base class for specific database transformers to inherit from. """ def __init__(self, *args, **kwargs): super(DatabaseTransformer, self).__init__(*args, **kwargs) self.database = PostgresDatabase(serializer=PostgresSerializer) self.created_models = dict() self.processed_subresources = set() @staticmethod def get_model_class(properties): """
from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('notubiz_committee') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception, ), retry_backoff=True) def committee_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'], 'supplier': 'notubiz', 'collection': 'committee', } committee = Organization(original_item['id'], **source_defaults) committee.canonical_iri = entity committee.has_organization_name = TopLevelOrganization( self.source_definition['allmanak_id'], source=self.source_definition['key'], supplier='allmanak', collection=self.source_definition['source_type'])
import re import iso8601 from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('ibabs_meeting') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True) def meeting_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'], 'supplier': 'ibabs', 'collection': 'meeting', } # Sometimes the meeting is contained in a sub-dictionary called 'Meeting' if 'Meeting' in original_item: meeting = original_item['Meeting'] else: meeting = original_item item = Meeting(meeting['Id'], **source_defaults) item.canonical_id = entity
from datetime import datetime from ocd_backend.log import get_source_logger from ocd_backend.items import BaseItem log = get_source_logger('loader') class ArchiefAlkmaarBaseItem(BaseItem): namespaces = { 'oai': 'http://www.openarchives.org/OAI/2.0/', 'dc': 'http://purl.org/dc/elements/1.1/', 'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/', 'xml': 'http://www.w3.org/XML/1998/namespace', 'europeana': 'http://www.europeana.eu/schemas/ese/', 'dcterms': 'http://purl.org/dc/terms/' } media_mime_types = { 'png': 'image/png', 'jpg': 'image/jpg', } def _get_text_or_none(self, xpath_expression): node = self.original_item.find( xpath_expression, namespaces=self.namespaces) if node is not None and node.text is not None: return self.cleanup_xml_inner(node) return None def get_original_object_id(self):
from urlparse import urljoin from lxml import etree from ocd_backend.items import BaseItem from ocd_backend.models import * from ocd_backend.models.model import Relationship from ocd_backend.utils.http import HttpRequestMixin from ocd_backend.log import get_source_logger log = get_source_logger('persons') class AlmanakPersonItem(HttpRequestMixin, BaseItem): def get_rights(self): return u'undefined' def get_collection(self): return unicode(self.source_definition['index_name']) def get_object_model(self): source_defaults = { 'source': 'almanak', 'source_id_key': 'identifier', 'organization': self.source_definition['key'], } request_url = u'https://almanak.overheid.nl%s' % ( unicode(self.original_item['url']),) r = self.http_session.get(request_url, verify=False) r.raise_for_status()
from ocd_backend import settings from ocd_backend.app import celery_app from ocd_backend.log import get_source_logger from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.utils.misc import deep_get, compare_insensitive log = get_source_logger('ggm_meeting') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=settings.AUTORETRY_EXCEPTIONS, retry_backoff=True) def meeting_item(self, content_type, raw_item, canonical_iri, cached_path, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': 'tweedekamer', 'supplier': 'gegevensmagazijn', 'canonical_iri': canonical_iri, 'cached_path': cached_path, } meeting = Meeting(original_item['Id'], collection='meeting', **source_defaults) meeting.name = original_item.get('Onderwerp') meeting.start_date = original_item.get('Aanvangstijd')
import simplejson as json from ocd_backend.app import celery_app from ocd_backend.extractors import BaseExtractor from ocd_backend.log import get_source_logger from ocd_backend.utils.http import GCSCachingMixin log = get_source_logger('ggm') ggm_base_url = 'https://gegevensmagazijn.tweedekamer.nl/' class GGMBaseExtractor(BaseExtractor, GCSCachingMixin): bucket_name = 'ggm' request_url = None def __init__(self, source_definition): super(GGMBaseExtractor, self).__init__(source_definition=source_definition) def run(self): assert self.request_url skip = celery_app.backend.get(self.request_url) if not skip: skip = 0 while True: full_url = '{}{}&$skip={}'.format(ggm_base_url, self.request_url, skip) _, _, odata_substring = full_url.rpartition(ggm_base_url) if self.exists(odata_substring): resource = self.download_cache(odata_substring)
from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('organizations') def transform_contact_details(data): """ Takes a dictionary of contact details and flattens every entry to {key: {label: label, value: value} . """ transformed_data = {} for key, value in data.items(): if 'label' in value: transformed_data[key] = value else: for key2, value2 in value.items(): transformed_data['%s_%s' % (key, key2)] = {'label': key2, 'value': value2} return transformed_data @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True) def municipality_organization_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'],
from datetime import datetime from ocd_backend.log import get_source_logger from ocd_backend.items import BaseItem log = get_source_logger('item') class A2AItem(BaseItem): namespaces = { 'oai': 'http://www.openarchives.org/OAI/2.0/', 'xml': 'http://www.w3.org/XML/1998/namespace', 'a2a': 'http://Mindbus.nl/A2A' } def _get_node_or_none(self, xpath_expression, node=None): """ Returns the requested node based on the xpath expression. Returns None if the node did not exist. """ if node is None: node = self.original_item return node.find(xpath_expression, namespaces=self.namespaces) def _get_text_or_none(self, xpath_expression, start_node=None): """ Returns the text node(s) in the node requested based on the xpath expression. Returns None if no text nodes could be found. Optionally you can specify a start_node for the expression. """ if start_node is None:
from tempfile import NamedTemporaryFile import requests import urllib3 from google.cloud import storage, exceptions from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from ocd_backend.exceptions import InvalidFile, ItemAlreadyProcessed from ocd_backend.log import get_source_logger from ocd_backend.settings import TEMP_DIR_PATH from ocd_backend.settings import USER_AGENT, DATA_DIR_PATH from ocd_backend.utils.misc import localize_datetime, datetime_to_unixstamp, \ str_to_datetime log = get_source_logger('http') class CustomRetry(Retry): """A subclass of the Retry class but with extra logging""" def increment(self, method=None, url=None, response=None, error=None, _pool=None, _stacktrace=None): res = super(CustomRetry, self).increment(method, url, response, error, _pool, _stacktrace) log.info("Retrying url: %s" % url) return res class HttpRequestMixin(object): """A mixin that can be used by extractors that use HTTP as a method
import re from ocd_backend import celery_app from ocd_backend.models.database import Neo4jDatabase from ocd_backend.models.definitions import Mapping, Prov, Ori from ocd_backend.models.exceptions import MissingProperty, ValidationError, \ QueryResultError from ocd_backend.models.properties import PropertyBase, Property, \ StringProperty, IntegerProperty, Relation from ocd_backend.models.serializers import Neo4jSerializer from ocd_backend.models.misc import Namespace, Uri from ocd_backend.utils.misc import iterate, doc_type from ocd_backend.log import get_source_logger from ocd_backend.utils.misc import slugify logger = get_source_logger('model') class ModelMetaclass(type): database_class = Neo4jDatabase serializer_class = Neo4jSerializer def __new__(mcs, name, bases, attrs): # Collect fields from current class. definitions = dict() for key, value in list(attrs.items()): if isinstance(value, PropertyBase): definitions[key] = value attrs.pop(key) if len(bases) > 1 and not issubclass(bases[0], Namespace):
import os

from ocd_backend.enrichers.media_enricher import MediaEnricher
from ocd_backend.log import get_source_logger
from ocd_backend.settings import DATA_DIR_PATH
from ocd_backend.utils.misc import get_sha1_hash
from ocd_backend.utils.http import LocalCachingMixin, GCSCachingMixin

log = get_source_logger('enricher')


class LocalStaticMediaEnricher(MediaEnricher, LocalCachingMixin):
    """MediaEnricher combined with LocalCachingMixin, so fetched media is
    cached via the local-filesystem caching strategy."""
    pass


class GCSStaticMediaEnricher(MediaEnricher, GCSCachingMixin):
    """MediaEnricher combined with GCSCachingMixin, so fetched media is
    cached via Google Cloud Storage."""

    # Bucket used by GCSCachingMixin for cached media.
    bucket_name = 'ori-static'
import re import iso8601 from ocd_backend.items import BaseItem from ocd_backend.log import get_source_logger from ocd_backend.models import * log = get_source_logger('item') class IBabsMeetingItem(BaseItem): def get_rights(self): return u'undefined' def get_collection(self): return unicode(self.source_definition['index_name']) def get_object_model(self): source_defaults = { 'source': 'ibabs', 'source_id_key': 'identifier', 'organization': self.source_definition['index_name'], } meeting = self.original_item if 'MeetingId' not in self.original_item: item = Meeting(self.original_item['Id'], **source_defaults) item.name = meeting['Meetingtype'] item.chair = meeting['Chairman'] item.location = meeting['Location'].strip()
import urllib import requests from ocd_backend.app import celery_app from ocd_backend.enrichers import BaseEnricher from ocd_backend.exceptions import SkipEnrichment from ocd_backend.log import get_source_logger from ocd_backend.settings import RESOLVER_BASE_URL, AUTORETRY_EXCEPTIONS from ocd_backend.utils.http import GCSCachingMixin from ocd_backend.utils.misc import strip_scheme from tasks.image_metadata import ImageMetadata log = get_source_logger('enricher') class MediaEnricher(BaseEnricher, GCSCachingMixin): """An enricher that is responsible for enriching external media (images, audio, video, etc.) Media items are fetched from the source and then passed on to a set of registered tasks that are responsible for the analysis. """ #: The registry of available sub-tasks that are responsible for the #: analysis of media items. bucket_name = 'ori-static' available_tasks = { 'image_metadata': ImageMetadata, }
import iso8601

from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('goapi_meeting')


class GOAPITransformer(BaseTransformer):
    def get_current_permalink(self, original_item):
        """Build the permalink of a meeting in the GemeenteOplossingen API:
        '<base_url>/<api_version>/meetings/<id>' (api_version defaults
        to 'v1')."""
        api_version = self.source_definition.get('api_version', 'v1')
        base_url = '%s/%s' % (
            self.source_definition['base_url'], api_version,)
        return u'%s/meetings/%i' % (base_url, original_item[u'id'],)

    def get_documents_as_media_urls(self, original_item):
        """Return a list of {'url': ..., 'note': filename} dicts, one per
        document attached to the meeting (empty list when the item has no
        'documents' key)."""
        current_permalink = self.get_current_permalink(original_item)

        output = []
        for document in original_item.get('documents', []):
            # sleep(1)
            url = u"%s/documents/%s" % (current_permalink, document['id'])
            output.append({
                'url': url,
                'note': document[u'filename']})
        return output
from datetime import datetime

from ocd_backend import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.mixins import OCDBackendTaskSuccessMixin, OCDBackendTaskFailureMixin
from ocd_backend.utils.misc import iterate

log = get_source_logger('loader')


class BaseLoader(OCDBackendTaskSuccessMixin, OCDBackendTaskFailureMixin,
                 celery_app.Task):
    """The base class that other loaders should inherit."""

    def start(self, *args, **kwargs):
        """Start loading of a single item.

        This method is called by the transformer and expects args to
        contain the output of the transformer as a tuple. Kwargs should
        contain the ``source_definition`` dict.

        Returns None (the previous docstring claimed it returned the
        transformer output, but no value is returned).
        """
        self.source_definition = kwargs['source_definition']

        for _, item in iterate(args):
            # post_processing is not defined in this class's visible body —
            # presumably supplied by a mixin or subclass; it runs before the
            # item is handed to the concrete loader.
            self.post_processing(item)
            self.load_item(item)

    def load_item(self, doc):
        # Concrete loaders must implement the actual persistence step.
        raise NotImplementedError
from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('notubiz_meeting') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception, ), retry_backoff=True) def meeting_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'], 'supplier': 'notubiz', 'collection': 'meeting', } event = Meeting(original_item['id'], **source_defaults) event.canonical_iri = entity event.has_organization_name = TopLevelOrganization( self.source_definition['allmanak_id'], source=self.source_definition['key'], supplier='allmanak', collection=self.source_definition['source_type']) event.start_date = original_item['plannings'][0]['start_date'] event.end_date = original_item['plannings'][0]['end_date']
from ocd_backend import celery_app
from ocd_backend import settings
from ocd_backend.es import elasticsearch as es
from ocd_backend.log import get_source_logger

log = get_source_logger('ocd_backend.tasks')


class BaseCleanup(celery_app.Task):
    """Celery task run at the end of a pipeline chain.

    Removes this chain's id from the run's active-chain set in the result
    backend; when no chains remain and the extractor has marked the run
    'done', finishes the run, otherwise extends the TTL of the run
    identifier so the bookkeeping keys survive until the run completes.
    """

    # Cleanup produces no result worth storing in the backend.
    ignore_result = True

    def run(self, *args, **kwargs):
        run_identifier = kwargs.get('run_identifier')
        # Set '<run_identifier>_chains' holds ids of chains still in flight.
        run_identifier_chains = '{}_chains'.format(run_identifier)
        self._remove_chain(run_identifier_chains, kwargs.get('chain_id'))

        # Finished only when this was the last chain AND the extractor has
        # flagged the run as 'done'.
        if self.backend.get_set_cardinality(run_identifier_chains) < 1 and self.backend.get(run_identifier) == 'done':
            self.backend.remove(run_identifier_chains)
            self.run_finished(**kwargs)
        else:
            # If the extractor is still running, extend the lifetime of the
            # identifier
            self.backend.update_ttl(run_identifier, settings.CELERY_CONFIG
                                    .get('CELERY_TASK_RESULT_EXPIRES', 1800))

    def _remove_chain(self, run_identifier, value):
        # Drop a single chain id from the run's active-chain set.
        self.backend.remove_value_from_set(run_identifier, value)

    def run_finished(self, run_identifier, **kwargs):
        """Hook invoked when the last chain of a run has completed.
        (Body continues beyond this chunk.)"""
import re import iso8601 from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('ibabs_report') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,), retry_backoff=True) def report_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'], 'supplier': 'ibabs', 'collection': 'report', } report = CreativeWork(original_item['id'][0], source=self.source_definition['key'], supplier='ibabs', collection='report') report.canonical_id = original_item['id'][0] report.has_organization_name = TopLevelOrganization(self.source_definition['allmanak_id'], source=self.source_definition['key'], supplier='allmanak',
import operator

import requests

from ocd_backend.enrichers.text_enricher.tasks import BaseEnrichmentTask
from ocd_backend.models.definitions import Meeting as MeetingNS, Rdf
from ocd_backend.models.misc import Uri
from ocd_backend.settings import ORI_CLASSIFIER_HOST, ORI_CLASSIFIER_PORT
from ocd_backend.utils.http import HttpRequestMixin
from ocd_backend.log import get_source_logger

log = get_source_logger('theme_classifier')


class ThemeClassifier(BaseEnrichmentTask, HttpRequestMixin):
    def enrich_item(self, item):
        """Send an item's text to the external ORI theme-classifier service.

        Bails out silently when the classifier host/port is not configured,
        when the item has no text, or when the text is too short (< 76
        characters) to classify meaningfully.
        """
        if not ORI_CLASSIFIER_HOST or not ORI_CLASSIFIER_PORT:
            # Skip classifier if no host is specified
            return

        ori_classifier_url = 'http://{}:{}/classificeer'.format(
            ORI_CLASSIFIER_HOST, ORI_CLASSIFIER_PORT)

        if not hasattr(item, 'text'):
            return

        text = item.text
        # FIX: was `type(item.text) == list` — use isinstance so list
        # subclasses are handled the same way (idiomatic type check).
        if isinstance(item.text, list):
            text = ' '.join(text)

        if not text or len(text) < 76:
            return
from lxml import etree from ocd_backend.log import get_source_logger from ocd_backend.transformers import BaseTransformer from ocd_backend.utils.misc import load_object, strip_namespaces log = get_source_logger('transformer') class GegevensmagazijnTransformer(BaseTransformer): def run(self, *args, **kwargs): args = args[0] self.source_definition = kwargs['source_definition'] item = self.deserialize_item(*args) return self.transform_item(*args, item=strip_namespaces(item)) def transform_item(self, raw_item_content_type, raw_item, item, class_name=False): if not class_name: class_name = item.xpath("local-name()") if class_name in self.source_definition['mapping']: item_source = self.source_definition['mapping'][class_name] item_class = item_source['item'] else:
from datetime import datetime from hashlib import sha1 from ocd_backend import settings from ocd_backend.app import celery_app from ocd_backend.log import get_source_logger from ocd_backend.models import * from ocd_backend.transformers import BaseTransformer log = get_source_logger('greenvalley') class GreenValleyTransformer(BaseTransformer): def __init__(self, *args, **kwargs): self.classification_mapping = { 'agenda': 'Agenda', 'agendapage': 'Agendapunt', 'bestuurlijkstuk': 'Bestuurlijk stuk', 'notule': 'Verslag', 'ingekomenstuk': 'Ingekomen stuk', 'antwoordstuk': 'Antwoord' # ? } def get_meeting_dates(self, meeting): """Determine meeting start and end dates.""" start_date = None end_date = None if meeting.get(u'bis_vergaderdatum', u'').strip() != u'': start_date = datetime.fromtimestamp(
from copy import deepcopy from datetime import datetime from uuid import uuid4 from celery import chain, group from elasticsearch.exceptions import NotFoundError from ocd_backend import settings from ocd_backend.app import celery_app from ocd_backend.es import elasticsearch as es from ocd_backend.exceptions import ConfigurationError from ocd_backend.log import get_source_logger from ocd_backend.utils.misc import load_object, propagate_chain_get logger = get_source_logger('pipeline') @celery_app.task(autoretry_for=settings.AUTORETRY_EXCEPTIONS, retry_backoff=True) def setup_pipeline(source_definition): logger.debug('[%s] Starting pipeline for source: %s' % (source_definition['key'], source_definition.get('id'))) # index_name is an alias of the current version of the index index_alias = '{prefix}_{index_name}'.format( prefix=source_definition.get('es_prefix', settings.DEFAULT_INDEX_PREFIX), index_name=source_definition.get('index_name', source_definition.get('id')) ) if not es.indices.exists(index_alias): index_name = '{index_alias}_{now}'.format(index_alias=index_alias, now=datetime.utcnow()
import json from ocd_backend import celery_app from ocd_backend import settings from ocd_backend.es import elasticsearch from ocd_backend.exceptions import ConfigurationError from ocd_backend.loaders import BaseLoader from ocd_backend.log import get_source_logger from ocd_backend.models.serializers import JsonLDSerializer from ocd_backend.utils import json_encoder from ocd_backend.utils.misc import get_sha1_hash log = get_source_logger('elasticsearch_loader') class ElasticsearchLoader(BaseLoader): """Indexes items into Elasticsearch. Each URL found in ``media_urls`` is added as a document to the ``RESOLVER_URL_INDEX`` (if it doesn't already exist). """ def start(self, *args, **kwargs): self.index_name = kwargs.get('new_index_name') if not self.index_name: raise ConfigurationError('The name of the index is not provided') return super(ElasticsearchLoader, self).start(*args, **kwargs) def load_item(self, doc): # Recursively index associated models like attachments
from ocd_backend import celery_app from ocd_backend.transformers import BaseTransformer from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('goapi_committee') @celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception, ), retry_backoff=True) def committee_item(self, content_type, raw_item, entity, source_item, **kwargs): original_item = self.deserialize_item(content_type, raw_item) self.source_definition = kwargs['source_definition'] source_defaults = { 'source': self.source_definition['key'], 'supplier': 'gemeenteoplossingen', 'collection': 'committee', } committee = Organization(original_item['id'], **source_defaults) committee.canonical_id = original_item['id'] committee.has_organization_name = TopLevelOrganization( self.source_definition['allmanak_id'], source=self.source_definition['key'], supplier='allmanak', collection=self.source_definition['source_type'])
from confluent_kafka import Producer from pyld import jsonld from ocd_backend import celery_app from ocd_backend import settings from ocd_backend.loaders import BaseLoader from ocd_backend.log import get_source_logger from ocd_backend.models.serializers import JsonLDSerializer log = get_source_logger('delta_loader') class DeltaLoader(BaseLoader): """Serializes a model to N-Quads and then sends it to a Kafka bus.""" config = { 'bootstrap.servers': settings.KAFKA_HOST, 'session.timeout.ms': settings.KAFKA_SESSION_TIMEOUT, } if settings.KAFKA_USERNAME: config['sasl.mechanisms'] = 'PLAIN' config['security.protocol'] = 'SASL_SSL' # config['ssl.ca.location'] = '/usr/local/etc/openssl/cert.pem' config['sasl.username'] = settings.KAFKA_USERNAME config['sasl.password'] = settings.KAFKA_PASSWORD def load_item(self, doc): # Skip this loader if it is disabled in settings if not settings.KAFKA_ENABLED:
from datetime import datetime from uuid import uuid4 from elasticsearch.exceptions import NotFoundError from celery import chain from ocd_backend.es import elasticsearch as es from ocd_backend import settings, celery_app from ocd_backend.log import get_source_logger from ocd_backend.utils.misc import load_object from ocd_backend.exceptions import ConfigurationError logger = get_source_logger('pipeline') def setup_pipeline(source_definition): # index_name is an alias of the current version of the index index_alias = '{prefix}_{index_name}'.format( prefix=settings.DEFAULT_INDEX_PREFIX, index_name=source_definition.get('index_name', source_definition.get('id')) ) if not es.indices.exists(index_alias): index_name = '{index_alias}_{now}'.format(index_alias=index_alias, now=datetime.utcnow() .strftime('%Y%m%d%H%M%S')) es.indices.create(index_name) es.indices.put_alias(name=index_alias, index=index_name)
from datetime import datetime import requests from ocd_backend.enrichers.text_enricher.tasks import BaseEnrichmentTask from ocd_backend.log import get_source_logger from ocd_backend.models.definitions import Geo, NeoGeo from ocd_backend.models.definitions import schema from ocd_backend.models.misc import Uri from ocd_backend.settings import LOCLINKVIS_HOST, LOCLINKVIS_PORT from ocd_backend.utils.http import HttpRequestMixin log = get_source_logger('waaroverheid') class WaarOverheidEnricher(BaseEnrichmentTask, HttpRequestMixin): """WaarOverheid Enricher searches for location data in text sources and returns which districts, neighborhoods and annotations were mentioned.""" loclinkvis_url = None def enrich_item(self, item): if not isinstance(item, schema.MediaObject): return if not LOCLINKVIS_HOST or not LOCLINKVIS_PORT: # Skip waaroverheid if no host is specified return self.loclinkvis_url = 'http://{}:{}'.format(LOCLINKVIS_HOST, LOCLINKVIS_PORT) cbs_id = self.source_definition.get('cbs_id')
# -*- coding: utf-8 -*- from sqlalchemy.orm.exc import MultipleResultsFound, NoResultFound from ocd_backend.models.definitions import Mapping, Prov, Ori, Meta from ocd_backend.models.exceptions import MissingProperty from ocd_backend.models.properties import PropertyBase, Property, StringProperty, Relation from ocd_backend.models.serializers import PostgresSerializer from ocd_backend.models.misc import Namespace, Uri from ocd_backend.utils.misc import iterate from ocd_backend.log import get_source_logger from ocd_backend.utils.misc import slugify from ocd_backend.models.postgres_database import PostgresDatabase logger = get_source_logger('model') class ModelMetaclass(type): database_class = PostgresDatabase serializer_class = PostgresSerializer def __new__(mcs, name, bases, attrs): # Collect fields from current class. definitions = dict() for key, value in list(attrs.items()): if isinstance(value, PropertyBase): definitions[key] = value attrs.pop(key) if len(bases) > 1 and not issubclass(bases[0], Namespace): raise ValueError('First argument of a Model subclass'
from datetime import datetime from hashlib import sha1 from pprint import pprint import iso8601 from ocd_backend.items import BaseItem from ocd_backend.models import * from ocd_backend.log import get_source_logger log = get_source_logger('goapi_meeting') class GemeenteOplossingenMeeting(BaseItem): def _get_current_permalink(self): api_version = self.source_definition.get('api_version', 'v1') base_url = '%s/%s' % ( self.source_definition['base_url'], api_version,) return u'%s/meetings/%i' % (base_url, self.original_item[u'id'],) def get_rights(self): return u'undefined' def get_collection(self): return unicode(self.source_definition['index_name']) def _get_documents_as_media_urls(self, documents): current_permalink = self._get_current_permalink() output = []
import requests import urllib3 from google.auth.exceptions import GoogleAuthError from google.cloud import storage, exceptions from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from ocd_backend.exceptions import InvalidFile, ItemAlreadyProcessed from ocd_backend.log import get_source_logger from ocd_backend.settings import TEMP_DIR_PATH from ocd_backend.settings import USER_AGENT, DATA_DIR_PATH from ocd_backend.utils.misc import localize_datetime, datetime_to_unixstamp, \ str_to_datetime log = get_source_logger('http') class CustomRetry(Retry): """A subclass of the Retry class but with extra logging""" def increment(self, method=None, url=None, response=None, error=None, _pool=None, _stacktrace=None): res = super(CustomRetry, self).increment(method, url, response, error, _pool, _stacktrace) log.info("Retrying url: %s" % url) return res
from datetime import datetime from urlparse import urljoin import requests from ocd_backend import settings from ocd_backend.app import celery_app from ocd_backend.log import get_source_logger from ocd_backend.models import * from ocd_backend.transformers import BaseTransformer from ocd_backend.utils.misc import strip_scheme log = get_source_logger('gedeputeerdestaten') class GedeputeerdeStatenTransformer(BaseTransformer): def __init__(self, *args, **kwargs): self.date_mapping = { 'januari': '01', 'februari': '02', 'maart': '03', 'april': '04', 'mei': '05', 'juni': '06', 'juli': '07', 'augustus': '08', 'september': '09', 'oktober': '10', 'november': '11', 'december': '12', }
import tempfile from urllib2 import HTTPError import magic import pdfparser.poppler as pdf import tika.parser as parser from ocd_backend.log import get_source_logger log = get_source_logger('file_parser') def file_parser(fname, pages=None): if magic.from_file(fname, mime=True) == 'application/pdf': try: result_pages = [] i = 0 d = pdf.Document(fname, quiet=True) for i, p in enumerate(d, start=1): text_array = [] for f in p: for b in f: for l in b: text_array.append(unicode(l.text)) result_pages.append('\n'.join(text_array)) if i >= pages: # break after x pages break log.debug("Processed %i pages (%i max)", i, pages) return result_pages
import re from suds.client import Client from suds.transport.https import HttpTransport from ocd_backend import settings from ocd_backend.extractors import BaseExtractor from ocd_backend.log import get_source_logger from ocd_backend.utils.api import FrontendAPIMixin from ocd_backend.utils.http import HttpRequestMixin from ocd_backend.utils.ibabs import (meeting_to_dict, meeting_type_to_dict, list_report_response_to_dict, list_entry_response_to_dict, votes_to_dict, person_profile_to_dict) log = get_source_logger('extractor') class IBabsBaseExtractor(BaseExtractor): """ A base extractor for the iBabs SOAP service. Instantiates the client and configures the right port tu use. """ def run(self): pass def __init__(self, *args, **kwargs): super(IBabsBaseExtractor, self).__init__(*args, **kwargs) try: ibabs_wsdl = self.source_definition['wsdl']
from lxml import etree

from ocd_backend.log import get_source_logger
from ocd_backend.transformers import BaseTransformer
from ocd_backend.utils.misc import load_object, strip_namespaces

log = get_source_logger('transformer')


class GegevensmagazijnTransformer(BaseTransformer):
    """Transforms Gegevensmagazijn XML items using a per-source mapping."""

    def run(self, *args, **kwargs):
        # The payload arrives wrapped in a one-element tuple; unwrap it
        # before deserializing.
        args = args[0]
        self.source_definition = kwargs['source_definition']
        item = self.deserialize_item(*args)
        # Namespaces are stripped so the mapping can use plain tag names.
        return self.transform_item(*args, item=strip_namespaces(item))

    def transform_item(self, raw_item_content_type, raw_item, item,
                       class_name=False):
        """Transform a single XML element according to the source mapping.

        :param raw_item_content_type: content type of the raw payload
        :param raw_item: the raw (serialized) item
        :param item: the deserialized, namespace-stripped lxml element
        :param class_name: optional explicit mapping key; defaults to the
            element's local tag name
        """
        if not class_name:
            # local-name() yields the element's tag without its namespace.
            class_name = item.xpath("local-name()")

        if class_name in self.source_definition['mapping']:
            item_source = self.source_definition['mapping'][class_name]
            item_class = item_source['item']
        else:
            # Unmapped element types are skipped rather than treated as
            # errors; an empty result keeps downstream tasks running.
            log.info('Skipping %s, does not exist in mapping' % class_name)
            return []
import json
from time import sleep

from lxml import etree

from ocd_backend.extractors import BaseExtractor
from ocd_backend.log import get_source_logger
from ocd_backend.utils.http import HttpRequestMixin

log = get_source_logger('extractor')


class GemeenteOplossingenBaseExtractor(BaseExtractor, HttpRequestMixin):
    """
    A base extractor for scraping GemeenteOplossingen websites. This
    base extractor just configures the base url to use for scraping.
    """

    def run(self):
        # Deliberate no-op: subclasses implement the actual extraction;
        # this base class only stores configuration in __init__.
        pass

    def __init__(self, *args, **kwargs):
        super(GemeenteOplossingenBaseExtractor, self).__init__(*args, **kwargs)

        # Root URL of the municipality's GemeenteOplossingen site, taken
        # from the source definition.
        self.base_url = self.source_definition['base_url']

    def _get_committees(self):
        """
        Gets a list of committees, along with links to upcoming and archived
        meetings.
        """
from ocd_backend import celery_app
from ocd_backend.transformers import BaseTransformer
from ocd_backend.models import *
from ocd_backend.log import get_source_logger

log = get_source_logger('ibabs_committee')


@celery_app.task(bind=True, base=BaseTransformer, autoretry_for=(Exception,),
                 retry_backoff=True)
def committee_item(self, content_type, raw_item, entity, source_item,
                   **kwargs):
    """Transform a raw iBabs meeting type into a committee Organization.

    Celery task (bound, with exponential retry on any exception).

    :param content_type: content type used to deserialize ``raw_item``
    :param raw_item: serialized source item
    :param entity: canonical identifier for the committee
    :param source_item: original source item
        # NOTE(review): source_item is not referenced in the visible part
        # of this task — confirm it is used further down.
    """
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    # Provenance fields shared by all model objects created here.
    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'ibabs',
        'collection': 'committee',
    }

    committee = Organization(original_item['Id'], **source_defaults)
    committee.canonical_id = entity
    # Link the committee to its municipality/province via the Allmanak id.
    committee.has_organization_name = TopLevelOrganization(
        self.source_definition['allmanak_id'],
        source=self.source_definition['key'],
        supplier='allmanak',
        collection=self.source_definition['source_type'])

    committee.name = original_item['Meetingtype']
    committee.description = original_item['Abbreviation']

    # Meeting types containing 'sub' are classified as subcommittees.
    if 'sub' in original_item['Meetingtype']:
        committee.classification = u'Subcommittee'
from ocd_backend import settings
from ocd_backend.app import celery_app
from ocd_backend.log import get_source_logger
from ocd_backend.models import *
from ocd_backend.transformers import BaseTransformer

log = get_source_logger('ibabs_person')


@celery_app.task(bind=True, base=BaseTransformer,
                 autoretry_for=settings.AUTORETRY_EXCEPTIONS,
                 retry_backoff=True)
def person_item(self, content_type, raw_item, canonical_iri, cached_path,
                **kwargs):
    """Transform a raw iBabs user record into a Person model object.

    Celery task (bound, retrying on the exception types configured in
    ``settings.AUTORETRY_EXCEPTIONS``).

    :param content_type: content type used to deserialize ``raw_item``
    :param raw_item: serialized source item
    :param canonical_iri: canonical IRI for the person
        # NOTE(review): canonical_iri is not referenced in the visible
        # part of this task — confirm it is used further down.
    :param cached_path: path of the cached raw item, stored as provenance
    """
    original_item = self.deserialize_item(content_type, raw_item)
    self.source_definition = kwargs['source_definition']

    # Provenance fields shared by all model objects created here.
    source_defaults = {
        'source': self.source_definition['key'],
        'supplier': 'ibabs',
        'collection': 'person',
        'cached_path': cached_path,
    }

    person = Person(original_item['UserId'], **source_defaults)
    # Link the person to their municipality/province via the Allmanak id.
    person.has_organization_name = TopLevelOrganization(
        self.source_definition['allmanak_id'],
        source=self.source_definition['key'],
        supplier='allmanak',
        collection=self.source_definition['source_type'])
import tempfile

from urllib2 import HTTPError

import magic
import pdfparser.poppler as pdf
import tika.parser as parser

from ocd_backend.log import get_source_logger

log = get_source_logger('file_parser')


def file_parser(fname, pages=None):
    """Extract text from the file at ``fname``.

    For PDFs (detected via libmagic MIME sniffing) the poppler-based
    parser is used and the UTF-8 encoded text of all processed pages is
    returned as a single newline-joined string.

    :param fname: path of the file to parse
    :param pages: maximum number of pages to process
        # NOTE(review): in Python 2 ``int >= None`` evaluates to True, so
        # leaving ``pages`` at its default of None makes the loop break
        # after the first page — probably should be guarded with
        # ``pages is not None and i >= pages``. Confirm intent.
    """
    if magic.from_file(fname, mime=True) == 'application/pdf':
        try:
            text_array = []
            i = 0  # keeps a value for the log line even if the PDF is empty
            d = pdf.Document(fname)
            # Document -> page -> flow -> block -> line hierarchy of poppler
            for i, p in enumerate(d, start=1):
                for f in p:
                    for b in f:
                        for l in b:
                            text_array.append(l.text.encode('UTF-8'))
                if i >= pages:
                    # break after x pages
                    break
            log.debug("Processed %i pages (%i max)", i, pages)
            return '\n'.join(text_array)
        except:  # reraise everything