def make_source_for(src, LOCATIONS):
    """Build the source-configuration dict for one ``src`` record.

    ``src`` must provide ``collection``, ``location`` and ``file_url``.
    When ``is_facebook`` accepts the URL, the Facebook extractor/item and a
    ``facebook`` settings block are filled in; otherwise the result is
    completed with whatever ``get_source_info_from_url`` returns for the
    URL.  Finally every ``src`` entry except ``file_url`` is copied into
    the result, so values coming from ``src`` win over the defaults set
    here.

    :param src: mapping describing the source.
    :param LOCATIONS: passed through to ``_normalize_location``.
    :returns: the assembled configuration dict.
    """
    slug = slugify(src['collection']).replace('-', '_')
    slug_location = slugify(src['location'])
    file_url = src['file_url']

    # Decide once; the answer drives both the id and the extractor settings.
    facebook = is_facebook(file_url)

    id_template = "%s_%s_fb_1" if facebook else "%s_%s_1"
    feed_id = id_template % (slug, slug_location)

    result = {
        "extractor": "",  # depends if feed or not
        "keep_index_on_update": True,
        "enrichers": [
            ["ocd_backend.enrichers.AS2TranslationEnricher", {}]
            # [
            #     "ocd_backend.enrichers.NEREnricher",
            #     {}
            # ],
            # [
            #     "ocd_backend.enrichers.BinoasEnricher",
            #     {}
            # ]
        ],
        "file_url": '',
        "index_name": slug,
        "transformer": "ocd_backend.transformers.BaseTransformer",
        "loader": "ocd_backend.loaders.AS2Loader",
        "item": "",  # html grabber
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        "location": _normalize_location(src['location'], LOCATIONS),
        "hidden": False,
        "id": feed_id,
    }

    if facebook:
        result["extractor"] = (
            "ocd_backend.extractors.facebook.FacebookExtractor")
        result["item"] = "ocd_backend.items.facebook.PageItem"
        result["facebook"] = {
            'app_secret': os.environ.get('FACEBOOK_APP_SECRET', None),
            'app_id': os.environ.get('FACEBOOK_APP_ID', None),
            "paging": False,
            "api_version": "v2.11",
            "graph_url": "%s/posts" % (get_facebook_path(file_url), ),
        }
    else:
        # dict.update works on Python 2 and 3 alike (iteritems() does not).
        result.update(get_source_info_from_url(file_url))

    # Copy everything from src except the URL.
    # NOTE(review): this also replaces the normalized "location" set above
    # with the raw src['location'] -- confirm that is intended.
    result.update(
        (key, value) for key, value in src.items() if key != 'file_url')
    return result
def _get_doc_type(self, doc, doc_type_spec):
    """Resolve the document type for ``doc``.

    A spec that starts with ``@name`` is a reference to a field of ``doc``:
    the value of ``doc[name]`` is slugified (with ``'_'`` passed as the
    second argument to ``slugify``) and returned.  Any other spec is
    returned unchanged.
    """
    field_ref = re.match(r'^@([\w_]+)', doc_type_spec)
    if field_ref is None:
        # Plain spec: it already is the doc type.
        return doc_type_spec

    field_name = field_ref.group(1)
    return slugify(doc[field_name], '_')
def __init__(self,
             source_id=None,
             source=None,
             supplier=None,
             collection=None,
             merge_into=None):
    """Set instance defaults and, when a source id is given, its mapping Uri.

    :param source_id: identifier in the originating source; when truthy,
        ``source``, ``supplier`` and ``collection`` must be truthy as well.
    :param merge_into: optional 3-tuple ``(predicate, column, value)``.
    :raises ValueError: if ``merge_into`` is truthy but not a 3-tuple.
    :raises AssertionError: if ``source_id`` is given without the other
        identifying arguments.
    """
    # Set defaults
    self.skip_validation = None
    self.values = dict()

    if not merge_into:
        self.merge_into = None
    elif isinstance(merge_into, tuple) and len(merge_into) == 3:
        self.merge_into = merge_into
    else:
        raise ValueError(
            'merge_into requires a tuple with 3 elements: (predicate, column, value)'
        )

    if not source_id:
        return

    # https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
    # i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476
    assert source
    assert supplier
    assert collection
    mapping_path = '{}/{}/{}/{}'.format(
        source, supplier, collection, slugify(source_id))
    self.had_primary_source = Uri(Mapping, mapping_path)
def __init__(self,
             source_id=False,
             organization=None,
             source=None,
             source_id_key=None):
    """Set instance defaults and, when a source id is given, its mapping Uri.

    ``source_id=False`` (the default) means "no source id": only the
    defaults are set.  Any other value requires ``source_id``,
    ``organization``, ``source`` and ``source_id_key`` to all be truthy.

    :raises AssertionError: if a source id is requested but one of the
        identifying arguments is falsy.
    """
    # Set defaults
    self.skip_validation = None
    self.values = dict()

    if source_id is False:
        return

    # https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
    # i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476
    assert source_id
    assert organization
    assert source
    assert source_id_key
    mapping_path = '{}/{}/{}/{}'.format(
        organization, source, source_id_key, slugify(source_id))
    self.had_primary_source = Uri(Mapping, mapping_path)
    # NOTE(review): _source is only recorded when a source id is given --
    # confirm against the code that reads it.
    self._source = source
def __init__(self,
             source_id=None,
             source=None,
             supplier=None,
             collection=None,
             merge_into=None,
             cached_path=None,
             canonical_iri=None):
    """Set instance defaults, canonical identifiers and the source IRI.

    :param source_id: identifier in the originating source; stored as
        ``canonical_id``.  When truthy, ``source``, ``supplier`` and
        ``collection`` must be truthy as well.
    :param merge_into: optional 3-tuple ``(predicate, column, value)``.
    :param cached_path: stored unchanged on the instance.
    :param canonical_iri: either a value, or a callable that is invoked
        with this instance to produce the value.
    :raises ValueError: if ``merge_into`` is truthy but not a 3-tuple.
    :raises AssertionError: if ``source_id`` is given without the other
        identifying arguments.
    """
    # Set defaults
    self.skip_validation = None
    self.values = dict()
    # Re-bind enricher_task on the instance; presumably it is a class-level
    # default -- confirm.
    self.enricher_task = self.enricher_task

    if not merge_into:
        self.merge_into = None
    elif isinstance(merge_into, tuple) and len(merge_into) == 3:
        self.merge_into = merge_into
    else:
        raise ValueError(
            'merge_into requires a tuple with 3 elements: (predicate, column, value)'
        )

    self.canonical_id = source_id
    self.cached_path = cached_path

    # canonical_iri may be a callable (e.g. a lambda) taking this instance;
    # anything that cannot be called is stored as-is.
    # NOTE(review): a TypeError raised inside the callable is also caught
    # here and results in the callable itself being stored.
    try:
        self.canonical_iri = canonical_iri(self)
    except TypeError:
        self.canonical_iri = canonical_iri

    if not source_id:
        return

    # https://argu.co/voc/mapping/<organization>/<source>/<source_id_key>/<source_id>
    # i.e. https://argu.co/voc/mapping/nl/ggm/vrsnummer/6655476
    assert source
    assert supplier
    assert collection
    mapping_path = '{}/{}/{}/{}'.format(
        source, supplier, collection, slugify(source_id))
    self.source_iri = Uri(Mapping, mapping_path)
def get_object_id(self):
    """Return the slug of the original item's ``Meetingtype`` value.

    The value is converted with ``unicode`` and stripped of surrounding
    whitespace before being slugified.
    """
    meeting_type = unicode(self.original_item['Meetingtype'])
    return slugify(meeting_type.strip())
def convert_party(party):
    """Build the source-configuration dict for one national party record.

    ``party`` must provide ``partij``, ``website`` and ``feed``.  A
    non-empty ``feed`` selects the feed extractor and item; an empty one
    selects the static-HTML extractor with an empty ``item_xpath``.  The
    location is fixed to the Netherlands.

    :returns: the assembled configuration dict.
    """
    party_name = party['partij']
    slug = slugify(party_name).replace('-', '_')
    feed_id = "%s_%s_1" % (slug, 'nederland')

    result = {
        "extractor": "",  # depends if feed or not
        "keep_index_on_update": True,
        "enrichers": [
            # [
            #     "ocd_backend.enrichers.NEREnricher",
            #     {}
            # ],
            # [
            #     "ocd_backend.enrichers.BinoasEnricher",
            #     {}
            # ]
        ],
        "file_url": party['website'],
        "index_name": slug,
        "transformer": "ocd_backend.transformers.BaseTransformer",
        "collection": party_name,
        "loader": "ocd_backend.loaders.ElasticsearchLoader",
        "item": "",  # html grabber
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        "location": 'Nederland',
        "hidden": False,
        "id": feed_id,
    }

    if party['feed'] != '':
        extractor_settings = {
            'extractor': "ocd_backend.extractors.feed.FeedExtractor",
            'item': "ocd_backend.items.feed.FeedContentFromPageItem",
        }
    else:
        # No feed: use the static-HTML extractor; "item" stays empty.
        extractor_settings = {
            'extractor':
            "ocd_backend.extractors.staticfile.StaticHtmlExtractor",
            'item_xpath': '',
        }
    result.update(extractor_settings)
    return result
def convert_party(party, feed_type, locations):
    """Build the source-configuration dict for one party row and feed type.

    ``feed_type`` must be ``'Feed'`` or ``'Facebook'``.  It selects the
    extractor/item pair and also names the ``party`` column that holds the
    URL.  The type-specific settings are stored in the result under the
    lower-cased feed type (``'feed'`` or ``'facebook'``).

    :param party: mapping with at least ``Partij``, ``RegioNaam`` and a
        ``feed_type`` column.
    :param feed_type: ``'Feed'`` or ``'Facebook'``.
    :param locations: passed through to ``_normalize_location``.
    :returns: the assembled configuration dict.
    :raises KeyError: if ``feed_type`` is not a known type or ``party``
        lacks a required column.
    """
    slug = slugify(party['Partij']).replace('-', '_')
    slug_location = slugify(party['RegioNaam']).replace('-', '_')

    feed_type_defs = {
        'Feed': {
            "extractor": "ocd_backend.extractors.feed.FeedExtractor",
            "item": "ocd_backend.items.feed.FeedContentFromPageItem",
            'env': {}
        },
        'Facebook': {
            "extractor":
            "ocd_backend.extractors.facebook.FacebookExtractor",
            "item": "ocd_backend.items.facebook.PageItem",
            'env': {
                'app_secret': os.environ.get('FACEBOOK_APP_SECRET', None),
                'app_id': os.environ.get('FACEBOOK_APP_ID', None),
                "paging": False,
                "api_version": "v2.11",
                "graph_url":
                "%s/posts" % (get_facebook_path(party[feed_type]), )
            }
        }
    }
    type_def = feed_type_defs[feed_type]

    # whoops
    # NOTE(review): the "_fb_" suffix is given to the 'Feed' type and the
    # plain suffix to everything else, which looks swapped.  It is kept
    # as-is because changing it would change the ids this function emits;
    # confirm before "fixing".
    if feed_type != 'Feed':
        feed_id = "%s_%s_1" % (slug, slug_location)
    else:
        feed_id = "%s_%s_fb_1" % (slug, slug_location)

    result = {
        "extractor": type_def['extractor'],
        "keep_index_on_update": True,
        "enrichers": [["ocd_backend.enrichers.NEREnricher", {}],
                      ["ocd_backend.enrichers.BinoasEnricher", {}]],
        # Fresh copy of the type-specific settings; replaces the former
        # iteritems() loop, which only works on Python 2.
        feed_type.lower(): dict(type_def['env']),
        "file_url": party[feed_type],
        "index_name": slug,
        "transformer": "ocd_backend.transformers.BaseTransformer",
        "collection": party['Partij'],
        "loader": "ocd_backend.loaders.ElasticsearchLoader",
        "item": type_def['item'],
        "cleanup": "ocd_backend.tasks.CleanupElasticsearch",
        "location": _normalize_location(party['RegioNaam'], locations),
        "hidden": False,
        "id": feed_id
    }
    return result
def get_object_id(self):
    """Return the slug of the original item's ``name`` value.

    The value is converted with ``unicode`` and stripped of surrounding
    whitespace before being slugified.
    """
    raw_name = unicode(self.original_item["name"])
    return slugify(raw_name.strip())