示例#1
0
def make_source_for(src, LOCATIONS):
    """Build the source-configuration dict for a single source row.

    Args:
        src: Mapping describing the source. The keys ``'collection'``,
            ``'location'`` and ``'file_url'`` are read directly; every other
            key is copied into the result unchanged.
        LOCATIONS: Passed through to ``_normalize_location`` together with
            ``src['location']``.

    Returns:
        dict: The configuration. Facebook URLs get the Facebook extractor and
        a ``"facebook"`` settings sub-dict; any other URL is completed with
        whatever ``get_source_info_from_url`` returns for it.
    """
    slug = slugify(src['collection']).replace('-', '_')
    slug_location = slugify(src['location'])

    # Evaluated once: it selects both the id suffix and the extractor wiring.
    from_facebook = is_facebook(src['file_url'])

    if from_facebook:
        feed_id = "%s_%s_fb_1" % (slug, slug_location)
    else:
        feed_id = "%s_%s_1" % (slug, slug_location)

    result = {
        "extractor": "",  # filled in below, depends on the kind of source
        "keep_index_on_update": True,
        # NEREnricher and BinoasEnricher are currently not enabled here.
        "enrichers": [["ocd_backend.enrichers.AS2TranslationEnricher", {}]],
        "file_url": '',
        "index_name": slug,
        "transformer": "ocd_backend.transformers.BaseTransformer",
        "loader": "ocd_backend.loaders.AS2Loader",
        "item": "",  # html grabber
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        "location": _normalize_location(src['location'], LOCATIONS),
        "hidden": False,
        "id": feed_id
    }

    if from_facebook:
        result["extractor"] = (
            "ocd_backend.extractors.facebook.FacebookExtractor")
        result["item"] = "ocd_backend.items.facebook.PageItem"
        result["facebook"] = {
            # Credentials come from the environment; None when unset.
            'app_secret': os.environ.get('FACEBOOK_APP_SECRET', None),
            'app_id': os.environ.get('FACEBOOK_APP_ID', None),
            "paging": False,
            "api_version": "v2.11",
            "graph_url": "%s/posts" % (get_facebook_path(src['file_url']), )
        }
    else:
        additional = get_source_info_from_url(src['file_url'])
        # .items() instead of .iteritems(): works on Python 2 and Python 3.
        for key, value in additional.items():
            result[key] = value

    # Everything the caller supplied (except the raw URL) wins over the
    # defaults above.
    # NOTE(review): this also replaces the normalized "location" with the raw
    # src['location'] -- confirm that is intended.
    for key, value in src.items():
        if key != 'file_url':
            result[key] = value
    return result
示例#2
0
 def _get_doc_type(self, doc, doc_type_spec):
     """Resolve the document type described by ``doc_type_spec``.

     A spec that starts with ``@`` followed by word characters names a field
     of ``doc``; the value of that field is passed through ``slugify`` with
     ``'_'`` as second argument and returned. Any other spec is returned
     unchanged.
     """
     field_ref = re.match(r'^@([\w_]+)', doc_type_spec)
     if field_ref is None:
         # Plain spec: use it literally.
         return doc_type_spec
     field_name = field_ref.group(1)
     return slugify(doc[field_name], '_')
示例#3
0
    def __init__(self,
                 source_id=None,
                 source=None,
                 supplier=None,
                 collection=None,
                 merge_into=None):
        """Set the default attributes and, if possible, the primary source.

        Args:
            source_id: Identifier of the item at its source. When truthy,
                ``source``, ``supplier`` and ``collection`` are asserted to
                be truthy too and ``had_primary_source`` is set.
            source: First path segment of the mapping URI.
            supplier: Second path segment of the mapping URI.
            collection: Third path segment of the mapping URI.
            merge_into: Optional ``(predicate, column, value)`` tuple. Any
                falsy value is stored as ``None``.

        Raises:
            ValueError: If ``merge_into`` is truthy but not a 3-tuple.
        """
        # Set defaults
        self.skip_validation = None
        self.values = {}

        well_formed = isinstance(merge_into, tuple) and len(merge_into) == 3
        if merge_into and not well_formed:
            raise ValueError(
                'merge_into requires a tuple with 3 elements: (predicate, column, value)'
            )
        self.merge_into = merge_into or None

        # https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
        # i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476
        if source_id:
            assert source
            assert supplier
            assert collection
            mapping_path = '{}/{}/{}/{}'.format(
                source, supplier, collection, slugify(source_id))
            self.had_primary_source = Uri(Mapping, mapping_path)
示例#4
0
    def __init__(self, source_id=False, organization=None, source=None, source_id_key=None):
        """Set the default attributes and, if requested, the primary source.

        Args:
            source_id: Identifier of the item at its source. The default
                ``False`` means "no primary source"; any other value must be
                truthy, as must the three arguments below.
            organization: First path segment of the mapping URI.
            source: Second path segment of the mapping URI; also kept on the
                instance as ``_source``.
            source_id_key: Third path segment of the mapping URI.
        """
        # Set defaults
        self.skip_validation = None
        self.values = {}

        if source_id is False:
            # Explicit opt-out: no primary source is recorded.
            pass
        else:
            assert source_id
            assert organization
            assert source
            assert source_id_key
            # https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
            # i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476
            mapping_path = '{}/{}/{}/{}'.format(
                organization, source, source_id_key, slugify(source_id))
            self.had_primary_source = Uri(Mapping, mapping_path)
            self._source = source
示例#5
0
    def __init__(self,
                 source_id=None,
                 source=None,
                 supplier=None,
                 collection=None,
                 merge_into=None,
                 cached_path=None,
                 canonical_iri=None):
        """Set the default attributes, canonical identifiers and source IRI.

        Args:
            source_id: Identifier of the item at its source; stored as
                ``canonical_id``. When truthy, ``source``, ``supplier`` and
                ``collection`` are asserted to be truthy and ``source_iri``
                is set.
            source: First path segment of the mapping URI.
            supplier: Second path segment of the mapping URI.
            collection: Third path segment of the mapping URI.
            merge_into: Optional ``(predicate, column, value)`` tuple. Any
                falsy value is stored as ``None``.
            cached_path: Stored unchanged as ``cached_path``.
            canonical_iri: Either a plain value, stored as-is, or a callable
                that is invoked with this instance to produce the value.

        Raises:
            ValueError: If ``merge_into`` is truthy but not a 3-tuple.
        """
        # Set defaults
        self.skip_validation = None
        self.values = {}
        # Re-bind whatever attribute lookup finds onto the instance itself
        # (presumably a class-level attribute -- confirm against the class).
        self.enricher_task = self.enricher_task

        well_formed = isinstance(merge_into, tuple) and len(merge_into) == 3
        if merge_into and not well_formed:
            raise ValueError(
                'merge_into requires a tuple with 3 elements: (predicate, column, value)'
            )
        self.merge_into = merge_into or None

        # These are assigned before canonical_iri is resolved so that a
        # callable canonical_iri can already read them from the instance.
        self.canonical_id = source_id
        self.cached_path = cached_path

        try:
            # canonical_iri may be a function (e.g. a lambda) of the instance
            resolved_iri = canonical_iri(self)
        except TypeError:
            # Not callable with one argument: keep the given value as-is.
            # Note that a TypeError raised inside the callable lands here too.
            resolved_iri = canonical_iri
        self.canonical_iri = resolved_iri

        # https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
        # i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476
        if source_id:
            assert source
            assert supplier
            assert collection
            mapping_path = '{}/{}/{}/{}'.format(
                source, supplier, collection, slugify(source_id))
            self.source_iri = Uri(Mapping, mapping_path)
 def get_object_id(self):
     """Return the object id: the slugified, stripped ``Meetingtype``."""
     meeting_type = unicode(self.original_item['Meetingtype'])
     return slugify(meeting_type.strip())
示例#7
0
 def get_object_id(self):
     """Derive the object id from the item's ``Meetingtype`` value.

     The value is converted to unicode, stripped of surrounding whitespace
     and passed through ``slugify``.
     """
     raw_type = self.original_item['Meetingtype']
     stripped = unicode(raw_type).strip()
     return slugify(stripped)
示例#8
0
def convert_party(party):
    """Build the source-configuration dict for a national party.

    Args:
        party: Mapping with the keys ``'partij'`` (party name), ``'website'``
            and ``'feed'``.

    Returns:
        dict: The configuration. A non-empty ``party['feed']`` selects the
        feed extractor and item; an empty one selects the static HTML
        extractor and adds an empty ``'item_xpath'``.
    """
    slug = slugify(party['partij']).replace('-', '_')
    # All of these sources share the fixed location "Nederland".
    feed_id = "%s_%s_1" % (slug, 'nederland')

    result = {
        "extractor": "",  # filled in below, depends on whether a feed exists
        "keep_index_on_update": True,
        # No enrichers are enabled for these sources.
        "enrichers": [],
        "file_url": party['website'],
        "index_name": slug,
        "transformer": "ocd_backend.transformers.BaseTransformer",
        "collection": party['partij'],
        "loader": "ocd_backend.loaders.ElasticsearchLoader",
        "item": "",  # html grabber
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        "location": 'Nederland',
        "hidden": False,
        "id": feed_id
    }

    if party['feed'] == '':
        # No feed known: fall back to the static HTML extractor.
        result['extractor'] = (
            "ocd_backend.extractors.staticfile.StaticHtmlExtractor")
        result['item_xpath'] = ''
    else:
        result['extractor'] = "ocd_backend.extractors.feed.FeedExtractor"
        result['item'] = "ocd_backend.items.feed.FeedContentFromPageItem"
    return result
def convert_party(party, feed_type, locations):
    """Build the source-configuration dict for one party feed.

    Args:
        party: Mapping with the keys ``'Partij'``, ``'RegioNaam'`` and one
            key per feed type holding that feed's URL.
        feed_type: ``'Feed'`` or ``'Facebook'``; selects the extractor, the
            item class and the key of ``party`` that holds the URL.
        locations: Passed through to ``_normalize_location`` together with
            ``party['RegioNaam']``.

    Returns:
        dict: The configuration. The type-specific settings are stored under
        the lower-cased feed type (``'feed'`` or ``'facebook'``).

    Raises:
        KeyError: If ``feed_type`` is not a key of ``party`` or is not one of
            the supported feed types.
    """
    slug = slugify(party['Partij']).replace('-', '_')
    slug_location = slugify(party['RegioNaam']).replace('-', '_')

    # NOTE(review): the Facebook settings are built for every feed type, so
    # get_facebook_path() also receives plain feed URLs -- confirm it copes.
    feed_type_defs = {
        'Feed': {
            "extractor": "ocd_backend.extractors.feed.FeedExtractor",
            "item": "ocd_backend.items.feed.FeedContentFromPageItem",
            'env': {}
        },
        'Facebook': {
            "extractor": "ocd_backend.extractors.facebook.FacebookExtractor",
            "item": "ocd_backend.items.facebook.PageItem",
            'env': {
                # Credentials come from the environment; None when unset.
                'app_secret': os.environ.get('FACEBOOK_APP_SECRET', None),
                'app_id': os.environ.get('FACEBOOK_APP_ID', None),
                "paging": False,
                "api_version": "v2.11",
                "graph_url":
                "%s/posts" % (get_facebook_path(party[feed_type]), )
            }
        }
    }

    # whoops: the "_fb_" suffix ends up on the 'Feed' ids and not on the
    # Facebook ones. Kept exactly as it was, presumably because existing ids
    # depend on it -- confirm before "fixing" this.
    if feed_type == 'Feed':
        feed_id = "%s_%s_fb_1" % (slug, slug_location)
    else:
        feed_id = "%s_%s_1" % (slug, slug_location)

    definition = feed_type_defs[feed_type]

    return {
        "extractor": definition['extractor'],
        "keep_index_on_update": True,
        "enrichers": [["ocd_backend.enrichers.NEREnricher", {}],
                      ["ocd_backend.enrichers.BinoasEnricher", {}]],
        # A fresh copy of the type-specific settings; dict() replaces the
        # Python 2-only .iteritems() copy loop and works on Python 2 and 3.
        feed_type.lower(): dict(definition['env']),
        "file_url": party[feed_type],
        "index_name": slug,
        "transformer": "ocd_backend.transformers.BaseTransformer",
        "collection": party['Partij'],
        "loader": "ocd_backend.loaders.ElasticsearchLoader",
        "item": definition['item'],
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        "location": _normalize_location(party['RegioNaam'], locations),
        "hidden": False,
        "id": feed_id
    }
 def get_object_id(self):
     """Return the object id: the slugified, stripped ``name`` of the item."""
     item_name = unicode(self.original_item["name"])
     return slugify(item_name.strip())