def dataset_upserted(dataset):
    if not dataset.get('related'):
        return None
    log.debug(u'Updating dataset post in Tumblr "{}".'.format(dataset['name']))
    template = templates_lookup.get_template('dataset.mako')
    body = template.render_unicode(
        conf = conf,
        dataset = dataset,
        ).strip()
    post_id_str = db.get(str(dataset['id']))
    if post_id_str is None:
        response = requests.post('https://api.tumblr.com/v2/blog/{}/post'.format(conf['tumblr.hostname']),
            auth = oauth,
            data = dict(
                body = body,
                format = 'html',
                slug = strings.slugify(dataset['name']),
                state = 'published',
                tags = 'opendata,dataviz',
                title = dataset['title'],
                type = 'text',
                ),
            headers = headers,
            )
        post_id = conv.check(conv.pipe(
            tumblr_response_to_id,
            conv.not_none,
            ))(response.text, state = conv.default_state)
        db[str(dataset['id'])] = str(post_id)
    else:
        response = requests.post('https://api.tumblr.com/v2/blog/{}/post/edit'.format(conf['tumblr.hostname']),
            auth = oauth,
            data = dict(
                body = body,
                format = 'html',
                id = int(post_id_str),
                slug = strings.slugify(dataset['name']),
                state = 'published',
                tags = 'opendata,dataviz',
                title = dataset['title'],
                type = 'text',
                ),
            headers = headers,
            )
        post_id = conv.check(conv.pipe(
            tumblr_response_to_id,
            conv.not_none,
            ))(response.text, state = conv.default_state)

def compute_words(self):
    self.words = sorted(set(strings.slugify(u'-'.join(
        fragment
        for fragment in itertools.chain(
            (
                self._id,
                texthelpers.textify_markdown(self.notes),
                self.title,
                ),
            itertools.chain(*(
                [
                    texthelpers.textify_markdown(related_link.get('description')),
                    related_link.get('title'),
                    ]
                for related_link in (self.related or [])
                )),
            itertools.chain(*(
                [
                    texthelpers.textify_markdown(resource.get('description')),
                    resource.get('format'),
                    resource.get('name'),
                    ]
                for resource in (self.resources or [])
                )),
            (
                tag['name']
                for tag in (self.tags or [])
                ),
            )
        if fragment is not None
        )).split(u'-'))) or None

def json_to_python(self):
    enum = self.enum
    if enum is None:
        return super(EnumCol, self).json_to_python
    # This converter accepts either an item number or an item name.
    index_by_slug = self.index_by_slug
    if index_by_slug is None:
        self.index_by_slug = index_by_slug = dict(
            (strings.slugify(name), index)
            for index, name in sorted(enum._vars.iteritems() if enum is not None else ())
            )
    return conv.pipe(
        conv.condition(
            conv.test_isinstance(basestring),
            conv.pipe(
                # Convert item name to its index.
                conv.input_to_slug,
                conv.test_in(index_by_slug),
                conv.function(lambda slug: index_by_slug[slug]),
                ),
            conv.pipe(
                # Verify that item index belongs to enumeration.
                conv.test_isinstance(int),
                conv.test_in(enum._vars),
                ),
            ),
        conv.default(
            self._default
            if self._default is not None and self._default in enum._nums
            else min(enum._vars.iterkeys())
            ),
        )

def user_extract(req):
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check = True)
    if user.email is None:
        return wsgihelpers.forbidden(ctx)
    legislation = ctx.node
    if legislation.is_owner(ctx) and legislation.is_dated:
        return wsgihelpers.bad_request(ctx, explanation = ctx._(u'This legislation is already dated.'))
    params = req.GET
    inputs = {
        'date': params.get('date'),
        }
    data, errors = conv.struct({
        'date': conv.pipe(
            conv.french_formatted_str_to_datetime,
            conv.default(datetime.datetime.utcnow()),
            ),
        })(inputs, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = errors)
    new_legislation = None
    new_legislation_title = ctx._(u'{} (copy {})').format(legislation.title, user.email)
    new_legislation_slug = strings.slugify(new_legislation_title)
    existing_legislations_cursor = model.Legislation.find(
        dict(
            slug = new_legislation_slug,
            ),
        as_class = collections.OrderedDict,
        )
    if existing_legislations_cursor.count() > 0:
        for existing_legislation in existing_legislations_cursor:
            if existing_legislation.is_owner(ctx):
                return wsgihelpers.redirect(ctx, location = existing_legislation.get_user_url(ctx))
        if new_legislation is None:
            return wsgihelpers.bad_request(
                ctx,
                explanation = ctx._(u'A legislation with the same name already exists.'),
                )
    else:
        new_legislation = model.Legislation(
            author_id = user._id,
            datetime_begin = legislation.datetime_begin,
            datetime_end = legislation.datetime_end,
            description = ctx._(u'Copy of legislation "{}"').format(legislation.title),
            title = new_legislation_title,
            slug = new_legislation_slug,
            )
    response = requests.post(
        conf['api.urls.legislations'],
        headers = {
            'Content-Type': 'application/json',
            'User-Agent': conf['app_name'],
            },
        data = json.dumps(dict(date = data['date'].isoformat(), legislation = legislation.json)),
        )
    new_legislation.json = response.json(object_pairs_hook = collections.OrderedDict).get('dated_legislation')
    new_legislation.save(safe = True)
    return wsgihelpers.redirect(ctx, location = new_legislation.get_user_url(ctx))

def id_or_name_or_words_to_instance(value, state = None):
    if value is None:
        return value, None
    if state is None:
        state = conv.default_state
    match = uuid_re.match(value)
    if match is None:
        self = cls.find_one(dict(name = value), as_class = collections.OrderedDict)
    else:
        self = cls.find_one(value, as_class = collections.OrderedDict)
    if self is None:
        slug = strings.slugify(value)
        words = sorted(set(slug.split(u'-')))
        instances = list(cls.find(
            dict(
                words = {'$all': [
                    re.compile(u'^{}'.format(re.escape(word)))
                    for word in words
                    ]},
                ),
            as_class = collections.OrderedDict,
            ).limit(2))
        if not instances:
            return value, state._(u"No organization with ID, name or words: {0}").format(value)
        if len(instances) > 1:
            return value, state._(u"Too many organizations with words: {0}").format(u' '.join(words))
        self = instances[0]
    return self, None

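The "$all" query above requires every search word to prefix-match one of the slugified words indexed on the document. A minimal sketch of that matching logic, using only the standard re module; search_words and document_words are hypothetical values made up for illustration:

import re

# Hypothetical search input and a pre-computed "words" index for one document.
search_words = sorted(set(u'conseil regional'.split()))
document_words = [u'conseil', u'regional', u'aquitaine']

# Same shape as the MongoDB "$all" query above: every search word must
# prefix-match at least one indexed word.
patterns = [re.compile(u'^{}'.format(re.escape(word))) for word in search_words]
matches = all(
    any(pattern.match(word) for word in document_words)
    for pattern in patterns
    )
assert matches  # u'^conseil' and u'^regional' each match an indexed word.
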
def find_category_name(column_name, entity_name):
    """For a given column, find its category name."""
    entity_categories = fields_api_data()['columns_tree'][entity_name]['children']
    for entity_category in entity_categories:
        if column_name in entity_category['children']:
            return strings.slugify(entity_category['label'], separator = '_')
    return None

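The separator argument controls the join character used by strings.slugify. A rough stand-in, good enough to see what the call above returns; simple_slugify is a simplified approximation, not the library implementation:

import re
import unicodedata

def simple_slugify(value, separator=u'-'):
    # Simplified stand-in for strings.slugify: strip accents, lowercase,
    # and join runs of non-alphanumerics with the separator.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    return re.sub('[^a-z0-9]+', separator, value.lower()).strip(separator)

# With separator='_', a category label becomes a snake_case name:
assert simple_slugify(u"Prestations sociales", separator=u'_') == u'prestations_sociales'
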
def create_group_or_org(request, is_org):
    context = contexts.Ctx(request)
    lang = request.urlvars.get('lang', templates.DEFAULT_LANG)
    user = auth.get_user_from_request(request)
    if not user:
        return wsgihelpers.unauthorized(context)  # redirect to login/register ?
    form = GroupForm(request.POST, i18n=context.translator)
    if request.method == 'POST' and form.validate():
        name = strings.slugify(form.title.data)
        ckan_api('organization_create' if is_org else 'group_create', user, {
            'name': name,
            'title': form.title.data,
            'description': form.description.data,
            'image_url': form.image_url.data,
            })
        redirect_url = urls.get_url(lang, 'organization' if is_org else 'group', name)
        return wsgihelpers.redirect(context, location=redirect_url)
    back_url = urls.get_url(lang, 'organizations' if is_org else 'groups')
    return templates.render_site('forms/group-create-form.html', request,
        is_new=True, is_org=is_org, form=form, back_url=back_url)

def after_show(self, context, pkg_dict):
    try:
        cookies = tk.request.cookies
    except TypeError:
        # TypeError: No object (name: request) has been registered for this thread.
        cookies = None
    if cookies is not None:
        territory_json_str = cookies.get('territory')
        if territory_json_str:
            c = tk.c
            try:
                c.territory = json.loads(territory_json_str)
            except ValueError:
                pass
            else:
                full_name = c.territory.get('full_name')
                if full_name is not None:
                    c.territory['full_name_slug'] = strings.slugify(full_name)
    # Add supplier to pkg_dict.
    from ckan.lib.dictization import model_dictize
    supplier_id = pkg_dict.get('supplier_id')
    if supplier_id is not None:
        # Code derived from model_dictize.package_dictize.
        model = context['model']
        group_rev = model.group_revision_table
        q = select([group_rev]) \
            .where(group_rev.c.id == supplier_id) \
            .where(group_rev.c.state == 'active')
        result = model_dictize._execute_with_revision(q, group_rev, context)
        organizations = dictization.obj_list_dictize(result, context)
        pkg_dict['supplier'] = organizations[0] if organizations else None

def json_to_python(self):
    enum = self.enum
    if enum is None:
        return conv.pipe(
            conv.test_isinstance((basestring, int)),
            conv.anything_to_int,
            )
    # This converter accepts either an item number or an item name.
    index_by_slug = self.index_by_slug
    if index_by_slug is None:
        self.index_by_slug = index_by_slug = dict(
            (strings.slugify(name), index)
            for index, name in sorted(enum._vars.iteritems())
            )
    return conv.pipe(
        conv.test_isinstance((basestring, int)),
        conv.condition(
            conv.anything_to_int,
            conv.pipe(
                # Verify that item index belongs to enumeration.
                conv.anything_to_int,
                conv.test_in(enum._vars),
                ),
            conv.pipe(
                # Convert item name to its index.
                conv.input_to_slug,
                conv.test_in(index_by_slug),
                conv.function(lambda slug: index_by_slug[slug]),
                ),
            ),
        )

def json_to_dated_python(self):
    enum = self.enum
    if enum is None:
        return conv.pipe(
            conv.test_isinstance((basestring, int)),
            conv.anything_to_int,
            )
    # This converter accepts either an item number or an item name.
    index_by_slug = self.index_by_slug
    if index_by_slug is None:
        self.index_by_slug = index_by_slug = dict(
            (strings.slugify(name), index)
            for index, name in sorted(enum._vars.iteritems())
            )
    return conv.pipe(
        conv.test_isinstance((basestring, int)),
        conv.condition(
            conv.anything_to_int,
            conv.pipe(
                # Verify that item index belongs to enumeration.
                conv.anything_to_int,
                conv.test_in(enum._vars),
                ),
            conv.pipe(
                # Convert item name to its index.
                conv.input_to_slug,
                conv.test_in(index_by_slug),
                conv.function(lambda slug: index_by_slug[slug]),
                ),
            ),
        )

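Both enum converters above accept either an item index or a slugified item name. A sketch of the same dual lookup without the conv combinators; the enumeration items are hypothetical, and lowercasing stands in for the slug normalization:

# Hypothetical enumeration: index -> item name.
items = {1: u'Celibataire', 2: u'Marie'}
index_by_slug = dict((name.lower(), index) for index, name in sorted(items.items()))

def to_index(value):
    # Mirrors the conv.condition above: strings go through the slug lookup,
    # anything else must already be a valid index.
    if isinstance(value, basestring):
        return index_by_slug[value.lower()]
    assert value in items
    return value

assert to_index(u'Marie') == 2
assert to_index(1) == 1
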
def send_stats():
    datasets_weight = [
        weight
        for weight in (
            dataset['weight']
            for dataset in metrics['datasets'].itervalues()
            )
        if weight is not None
        ]
    datasets_total_weight = sum(datasets_weight)
    global stats
    stats = dict(
        datasets_average_weight = round(datasets_total_weight / len(datasets_weight), 2),
        datasets_count = len(metrics['datasets']),
        datasets_median_weight = round(median(datasets_weight), 2),
        # datasets_median_80_percent_weight = median_80_percent(datasets_weight),
        datasets_total_weight = round(datasets_total_weight, 2),
        formats_count = len(set(
            strings.slugify(resource['format'])
            for dataset in metrics['datasets'].itervalues()
            for resource in dataset['resources']
            )),
        organizations_count = len(metrics['organizations']),
        organizations_public_services_count = sum(
            1
            for organization in metrics['organizations'].itervalues()
            if organization['public_service']
            ),
        related_count = sum(
            dataset['related_count']
            for dataset in metrics['datasets'].itervalues()
            ),
        resources_count = sum(
            len(dataset['resources'])
            for dataset in metrics['datasets'].itervalues()
            ),
        )
    request = urllib2.Request(urlparse.urljoin(conf['dactylo.site_url'], 'api/1/states'), headers = request_headers)
    request_data = dict(
        api_key = conf['dactylo.api_key'],
        value = stats,
        )
    try:
        response = urllib2.urlopen(request, json.dumps(request_data))
    except urllib2.HTTPError as response:
        log.error(u'An error occurred while setting stats: {}'.format(stats))
        response_text = response.read()
        try:
            response_dict = json.loads(response_text)
        except ValueError:
            log.error(response_text)
            raise
        for key, value in response_dict.iteritems():
            print '{} = {}'.format(key, value)
        raise
    else:
        assert response.code == 200
        conv.check(cow_response_to_value)(response.read(), state = conv.default_state)

def compute_words(self):
    self.words = sorted(
        set(
            strings.slugify(u'-'.join(
                fragment
                for fragment in (
                    self._id,
                    self.email,
                    self.full_name,
                    )
                if fragment is not None
                )).split(u'-'))) or None

def compute_words(self):
    self.words = sorted(
        set(
            strings.slugify(u'-'.join(
                fragment
                for fragment in (
                    self._id,
                    # texthelpers.textify_html(self.description),
                    # self.title,
                    )
                if fragment is not None
                )).split(u'-'))) or None

def setUp(self):  # noqa
    super(TestLegislations, self).setUp()
    self.ctx = contexts.Ctx()
    legislation_title = u'Legislation 1'
    self.legislation = model.Legislation(
        description = legislation_title,
        slug = strings.slugify(legislation_title),
        title = legislation_title,
        )
    self.legislation.save(safe = True)

def compute_words(self):
    self.words = sorted(set(strings.slugify(u'-'.join(
        fragment
        for fragment in (
            self._id,
            texthelpers.textify_markdown(self.description),
            self.title,
            )
        if fragment is not None
        )).split(u'-'))) or None

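All the compute_words variants follow the same recipe: join the non-None fragments with u'-', slugify, split back on u'-', and deduplicate into a sorted word index. A self-contained sketch with hypothetical fragments, where lowercasing stands in for strings.slugify on already-ASCII input:

fragments = (u'5241cdcd1a4b', u'Comptes des communes', None, u'Finances')
joined = u'-'.join(fragment for fragment in fragments if fragment is not None)
slug = joined.lower().replace(u' ', u'-')  # stand-in for strings.slugify
words = sorted(set(slug.split(u'-'))) or None
assert words == [u'5241cdcd1a4b', u'communes', u'comptes', u'des', u'finances']
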
def name_package(self, title):
    for index in itertools.count(1):
        differentiator = u'-{}'.format(index) if index > 1 else u''
        name = u'{}{}-{}'.format(
            strings.slugify(title)[:100 - len(self.supplier_abbreviation) - 1 - len(differentiator)].rstrip(u'-'),
            differentiator,
            self.supplier_abbreviation,
            )
        if name not in self.package_by_name:
            return name

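name_package budgets the 100-character CKAN name limit across the slug, the dash, the differentiator and the supplier abbreviation. A worked example of that arithmetic with hypothetical values:

import itertools

supplier_abbreviation = u'dp'
slug = u'comptes-des-communes'  # what strings.slugify(title) might return
taken = set([u'comptes-des-communes-dp'])  # names already registered

for index in itertools.count(1):
    differentiator = u'-{}'.format(index) if index > 1 else u''
    budget = 100 - len(supplier_abbreviation) - 1 - len(differentiator)
    name = u'{}{}-{}'.format(slug[:budget].rstrip(u'-'), differentiator, supplier_abbreviation)
    if name not in taken:
        break
# The first candidate collides, so the next one carries a u'-2' differentiator:
assert name == u'comptes-des-communes-2-dp'
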
def compute_words(self):
    self.words = sorted(set(strings.slugify(u'-'.join(
        unicode(fragment)
        for fragment in (
            self._id,
            self.description,
            self.title,
            )
        if fragment is not None
        )).split(u'-'))) or None

def compute_words(self):
    self.words = sorted(set(strings.slugify(u'-'.join(
        unicode(fragment)
        for fragment in (
            self._id,
            self.email,
            self.full_name,
            )
        if fragment is not None
        )).split(u'-'))) or None

def __init__(self, admin_name = None, old_supplier_title = None, supplier_abbreviation = None,
        supplier_title = None, target_headers = None, target_site_url = None):
    if admin_name is not None:
        self.admin_name = admin_name
    if old_supplier_title is not None:
        assert isinstance(old_supplier_title, unicode)
        self.old_supplier_title = old_supplier_title
        old_supplier_name = strings.slugify(old_supplier_title)
        assert old_supplier_name
        assert len(old_supplier_name) <= 100
        self.old_supplier_name = old_supplier_name
    assert isinstance(supplier_abbreviation, unicode)
    assert supplier_abbreviation == strings.slugify(supplier_abbreviation)
    assert 1 < len(supplier_abbreviation) < 5
    self.supplier_abbreviation = supplier_abbreviation
    assert isinstance(supplier_title, unicode)
    self.supplier_title = supplier_title
    supplier_name = strings.slugify(supplier_title)
    assert supplier_name
    assert len(supplier_name) <= 100
    self.supplier_name = supplier_name
    assert isinstance(target_headers, dict)
    assert isinstance(target_headers['Authorization'], basestring)
    assert isinstance(target_headers['User-Agent'], basestring)
    self.target_headers = target_headers
    assert isinstance(target_site_url, unicode)
    self.target_site_url = target_site_url
    self.existing_packages_name = set()
    self.group_by_name = {}
    self.organization_by_name = {}
    self.organization_name_by_package_name = {}
    self.package_by_name = {}
    self.package_source_by_name = {}
    self.packages_by_organization_name = {}
    self.related_by_package_name = {}

def get_cookie(request):
    if request.cookies.get('territory-infos', '').count('|') == 1:
        territory_key, _ = request.cookies.get('territory-infos').split('|')
        territory = fetch(*territory_key.split('/')) if territory_key else {}
    else:
        territory = {}
    return {
        'full_name': territory.get('full_name', ''),
        'full_name_slug': strings.slugify(territory.get('full_name', '')),
        'depcom': territory.get('code', ''),
        }

def build_slug(title, previous=None):
    base_slug = strings.slugify(title)[:PACKAGE_NAME_MAX_LENGTH]
    exists_query = DB.query(Package.name)
    slug_exists = lambda s: exists_query.filter(Package.name == s).count() > 0
    if base_slug == previous or not slug_exists(base_slug):
        return base_slug
    idx = 0
    while True:
        suffix = '-{0}'.format(idx)
        slug = ''.join([base_slug[:-len(suffix)], suffix])
        if slug == previous or not slug_exists(slug):
            return slug
        idx += 1

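The same probe-until-free loop, isolated from the database; an in-memory set stands in for the Package.name query, and this variant shortens the base slug only by as much room as the suffix needs:

def build_unique_slug(base_slug, existing, max_length=100):
    # In-memory sketch of build_slug: the set "existing" replaces the DB lookup.
    base_slug = base_slug[:max_length]
    if base_slug not in existing:
        return base_slug
    idx = 0
    while True:
        suffix = u'-{0}'.format(idx)
        slug = base_slug[:max_length - len(suffix)] + suffix
        if slug not in existing:
            return slug
        idx += 1

assert build_unique_slug(u'budget', set([u'budget', u'budget-0'])) == u'budget-1'
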
def duplicate(req):
    ctx = contexts.Ctx(req)
    test_case = ctx.node
    user = model.get_user(ctx, check = True)
    new_test_case_title = ctx._(u'Copy of {}').format(test_case.title)
    new_test_case = model.TestCase(
        author_id = user._id,
        description = new_test_case_title,
        title = new_test_case_title,
        slug = strings.slugify(new_test_case_title),
        )
    new_test_case.save(safe = True)
    return wsgihelpers.redirect(ctx, location = user.get_user_url(ctx))

def edit_group_or_org(request, is_org):
    context = contexts.Ctx(request)
    lang = request.urlvars.get('lang', templates.DEFAULT_LANG)
    user = auth.get_user_from_request(request)
    if not user:
        return wsgihelpers.unauthorized(context)  # redirect to login/register ?
    group_name = request.urlvars.get('name')
    group = Group.by_name(group_name)
    if not group:
        return wsgihelpers.not_found(context)
    form = GroupForm(request.POST, group, i18n=context.translator)
    if request.method == 'POST' and form.validate():
        name = strings.slugify(form.title.data)
        extras = [
            {'key': key, 'value': value}
            for key, value in group.extras.items()
            ]
        ckan_api('organization_update' if is_org else 'group_update', user, {
            'id': group.id,
            'name': name,
            'title': form.title.data,
            'description': form.description.data,
            'image_url': form.image_url.data,
            'extras': extras,
            'users': _get_members(group),
            })
        redirect_url = urls.get_url(lang, 'organization' if is_org else 'group', name)
        return wsgihelpers.redirect(context, location=redirect_url)
    group_type = 'organization' if is_org else 'group'
    group_base_url = urls.get_url(lang, group_type)
    back_url = urls.get_url(lang, group_type, group.name)
    delete_url = urls.get_url(lang, group_type, 'delete', group.name)
    return templates.render_site('forms/group-edit-form.html', request,
        is_org=is_org, form=form, group_base_url=group_base_url, group=group,
        back_url=back_url, delete_url=delete_url)

def compute_attributes(self):
    url_name = conv.check(conv.input_to_url_name)(self.name)
    if url_name is None:
        if self.url_name is not None:
            del self.url_name
    else:
        self.url_name = url_name
    self.words = sorted(
        set(
            strings.slugify(u'-'.join(
                fragment
                for fragment in (
                    unicode(self._id),
                    self.name,
                    )
                if fragment is not None
                )).split(u'-'))) or None
    return self

def save(self, *args, **kwargs):
    if self.upload_at is None:
        self.upload_at = datetime.datetime.utcnow()
    # Find a unique slug based on filename.
    if self.slug is None:
        slug = slugify(self.filename)
        distinguish = 1
        while True:
            proposal = slug if distinguish == 1 else '%s-%d' % (slug, distinguish)
            if not Projects.find_one({'slug': proposal}):
                slug = proposal
                break
            distinguish += 1
        self.slug = slug
    return super(Projects, self).save(*args, **kwargs)

def convert_element_to_article(self, element, updated):
    title_url = None
    for xpath in (
            './/h1',
            './/h2',
            './/h3',
            './/h4',
            './/h5',
            './/h6',
            ):
        heading_elements = element.xpath(xpath)
        if len(heading_elements) > 0:
            title = lxml.html.tostring(heading_elements[0], encoding=unicode, method='text').strip()
            # Remove header from article element.
            header_element = None
            for ancestor_element in iter_element_ancestors(heading_elements[0]):
                if ancestor_element.tag in ('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup') \
                        or ancestor_element.tag == 'div' and ancestor_element.get('class') == 'page-header':
                    header_element = ancestor_element
                    if ancestor_element.tag == 'a':
                        url, error = conv.pipe(
                            conv.make_input_to_url(),
                            conv.not_none,
                            )(ancestor_element.get('href'), state=self.ctx)
                        if error is None:
                            title_url = url
            header_element.getparent().remove(header_element)
            break
    else:
        title = None
    return dict(
        element=element,
        hash=element.get('id') or strings.slugify(title),
        id=element.get('id'),
        node=self,
        title=title,
        title_url=title_url,
        updated=get_element_time(element, default=updated),
        )

def edit(req):
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check = True)
    params = req.params
    inputs = {
        'title': params.get('title'),
        'description': params.get('description'),
        }
    data, errors = conv.struct({
        'title': conv.cleanup_line,
        'description': conv.cleanup_line,
        })(inputs, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = errors)
    test_case = ctx.node
    test_case.description = data['description']
    test_case.slug = strings.slugify(data['title'])
    test_case.title = data['title']
    test_case.save(safe = True)
    return wsgihelpers.redirect(ctx, location = user.get_user_url(ctx))

def handle_upload(request, field, user=None):
    from ckan.controllers import storage
    if not isinstance(field.data, cgi.FieldStorage):
        return None
    filename, ext = splitext(field.data.filename)
    filename = strings.slugify(filename)
    filename = ''.join([filename, ext])
    filename = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name=filename, ts=datetime.now())
    ofs = storage.get_ofs()
    ofs.put_stream(STORAGE_BUCKET, filename, field.data.file, {
        'filename-original': field.data.filename,
        'uploaded-by': user.name if user else '',
        })
    root = conf['home_url']
    if root.startswith('//'):
        root = root.replace('//', 'https://' if conf['https'] else 'http://', 1)
    path = urls.get_url(None, 'storage/f', filename)
    return ''.join([root, path])

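The storage path relies on str.format applying a strftime spec inline, so each upload is namespaced under a second-resolution timestamp prefix; for example, with a hypothetical filename and date:

from datetime import datetime

path = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name='rapport.pdf', ts=datetime(2013, 7, 1, 9, 30, 0))
assert path == '2013-07-01T09-30-00/rapport.pdf'
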
def convert_element_to_article(self, ctx, element, updated):
    title_url = None
    for xpath in (
            './/h1',
            './/h2',
            './/h3',
            './/h4',
            './/h5',
            './/h6',
            ):
        heading_elements = element.xpath(xpath)
        if len(heading_elements) > 0:
            title = lxml.html.tostring(heading_elements[0], encoding = unicode, method = 'text').strip()
            # Remove header from article element.
            header_element = None
            for ancestor_element in iter_element_ancestors(heading_elements[0]):
                if ancestor_element.tag in ('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup') \
                        or ancestor_element.tag == 'div' and ancestor_element.get('class') == 'page-header':
                    header_element = ancestor_element
                    if ancestor_element.tag == 'a':
                        url, error = conv.pipe(
                            conv.make_input_to_url(),
                            conv.not_none,
                            )(ancestor_element.get('href'), state = ctx)
                        if error is None:
                            title_url = url
            header_element.getparent().remove(header_element)
            break
    else:
        title = None
    return dict(
        element = element,
        hash = element.get('id') or strings.slugify(title),
        id = element.get('id'),
        node = self,
        title = title,
        title_url = title_url,
        updated = get_element_time(element, default = updated),
        )

def csv_infos_to_csv_bytes(csv_infos_by_schema_name, state = None):
    from . import ramdb
    if csv_infos_by_schema_name is None:
        return None, None
    if state is None:
        state = default_state
    csv_bytes_by_name = {}
    for schema_name, csv_infos in csv_infos_by_schema_name.iteritems():
        csv_file = StringIO()
        writer = csv.writer(csv_file, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
        writer.writerow([
            (label or u'').encode("utf-8")
            for label in csv_infos['columns_label']
            ])
        for row in csv_infos['rows']:
            writer.writerow([
                unicode(cell).encode('utf-8') if cell is not None else None
                for cell in row
                ])
        csv_filename = '{0}.csv'.format(strings.slugify(ramdb.schema_title_by_name.get(schema_name, schema_name)))
        csv_bytes_by_name[csv_filename] = csv_file.getvalue()
    return csv_bytes_by_name or None, None

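Under Python 2, csv.writer expects byte strings, hence the per-cell UTF-8 encoding above. A minimal sketch of the same row handling, with hypothetical rows:

import csv
from StringIO import StringIO

csv_file = StringIO()
writer = csv.writer(csv_file, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
for row in [[u'Ville', u'Population'], [u'Rennes', 123456], [u'Brest', None]]:
    # Encode unicode cells to UTF-8 bytes; None stays an empty cell.
    writer.writerow([
        unicode(cell).encode('utf-8') if cell is not None else None
        for cell in row
        ])
csv_bytes = csv_file.getvalue()
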
def login(req):
    """Authorization request"""
    ctx = contexts.Ctx(req)
    assert req.method == 'POST'
    params = req.POST
    inputs = dict(
        assertion=params.get('assertion'),
        )
    data, errors = conv.struct(
        dict(
            assertion=conv.pipe(
                conv.cleanup_line,
                conv.not_none,
                ),
            ),
        )(inputs, state=ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation=ctx._(u'Login Error: {0}').format(errors))
    response = requests.post(
        'https://verifier.login.persona.org/verify',
        data=dict(
            audience=urls.get_full_url(ctx),
            assertion=data['assertion'],
            ),
        verify=True,
        )
    if not response.ok:
        return wsgihelpers.internal_error(
            ctx,
            dump=response.text,
            explanation=ctx._(u'Error while verifying authentication assertion'),
            )
    verification_data = json.loads(response.content)
    # Check if the assertion was valid.
    if verification_data['status'] != 'okay':
        return wsgihelpers.internal_error(
            ctx,
            dump=response.text,
            explanation=ctx._(u'Error while verifying authentication assertion'),
            )
    user = model.Account.find_one(
        dict(
            email=verification_data['email'],
            ),
        as_class=collections.OrderedDict,
        )
    if user is None:
        user = model.Account()
        user._id = unicode(uuid.uuid4())
        user.api_key = unicode(uuid.uuid4())
        user.email = verification_data['email']
        user.full_name = verification_data['email']
        user.slug = strings.slugify(user.full_name)
        user.compute_attributes()
        user.save(ctx, safe=True)
    ctx.user = user
    session = ctx.session
    if session is None:
        ctx.session = session = model.Session()
        session.synchronizer_token = unicode(uuid.uuid4())
        session.token = unicode(uuid.uuid4())
    session.expiration = datetime.datetime.utcnow() + datetime.timedelta(hours=4)
    session.user_id = user._id
    session.save(ctx, safe=True)
    if req.cookies.get(conf['cookie']) != session.token:
        req.response.set_cookie(conf['cookie'], session.token, httponly=True, secure=req.scheme == 'https')
    return 'Login succeeded.'

def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('config', help='path of configuration file')
    parser.add_argument('csv_file_path', help='path of CSV file containing the groups to use by organization')
    parser.add_argument('-v', '--verbose', action='store_true', help='increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.WARNING, stream=sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here=os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment=True, error_if_path=True, error_if_query=True,
                        full=True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default='drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Change-datasets-groups-by-organization')), conv.default_state)
    ckan_headers = {
        'Authorization': conf['ckan.api_key'],
        'User-Agent': conf['user_agent'],
        }

    group_by_name = {}
    groups_name_by_organization_name = {}
    organization_by_id = {}
    organization_by_name = {}
    with open(args.csv_file_path) as csv_file:
        csv_reader = csv.reader(csv_file)
        csv_reader.next()
        for row in csv_reader:
            organization_title, group1_title, group2_title = [
                cell.decode('utf-8').strip() or None
                for cell in row
                ]
            if organization_title is None or group1_title is None:
                continue
            organization_name = strings.slugify(organization_title)[:100]
            if organization_name not in organization_by_name:
                request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'],
                    '/api/3/action/organization_show?id={}'.format(organization_name)), headers=ckan_headers)
                try:
                    response = urllib2.urlopen(request)
                except urllib2.HTTPError as response:
                    if response.code == 404:
                        log.warning(u'Skipping missing organization: {}'.format(organization_name))
                        continue
                    raise
                else:
                    response_dict = json.loads(response.read())
                    organization = response_dict['result']
                    organization_by_id[organization['id']] = organization
                    organization_by_name[organization_name] = organization
            for group_title in (group1_title, group2_title):
                if group_title is None:
                    continue
                group_name = strings.slugify(group_title)[:100]
                if group_name not in group_by_name:
                    request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'],
                        '/api/3/action/group_show?id={}'.format(group_name)), headers=ckan_headers)
                    try:
                        response = urllib2.urlopen(request)
                    except urllib2.HTTPError as response:
                        if response.code == 404:
                            log.info(u'Creating group: {}'.format(group_name))
                            request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'],
                                '/api/3/action/group_create'), headers=ckan_headers)
                            response = urllib2.urlopen(request, urllib.quote(json.dumps(dict(
                                name=group_name,
                                title=group_title,
                                ))))
                            response_dict = json.loads(response.read())
                            group_by_name[group_name] = response_dict['result']
                        else:
                            raise
                    else:
                        response_dict = json.loads(response.read())
                        group_by_name[group_name] = response_dict['result']
                groups_name_by_organization_name.setdefault(organization_name, set()).add(group_name)

    # Retrieve names of packages already existing in CKAN.
    request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'], '/api/3/action/package_list'),
        headers=ckan_headers)
    response = urllib2.urlopen(request)
    response_dict = json.loads(response.read())
    packages_name = conv.check(conv.pipe(
        conv.ckan_json_to_name_list,
        conv.not_none,
        ))(response_dict['result'], state=conv.default_state)
    for package_name in packages_name:
        request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'],
            '/api/3/action/package_show?id={}'.format(package_name)), headers=ckan_headers)
        response = urllib2.urlopen(request)
        response_dict = json.loads(response.read())
        package = conv.check(conv.pipe(
            conv.make_ckan_json_to_package(drop_none_values=True),
            conv.not_none,
            conv.ckan_input_package_to_output_package,
            ))(response_dict['result'], state=conv.default_state)
        organization_id = package.get('owner_org')
        organization = organization_by_id.get(organization_id)
        if organization is None:
            continue
        groups_name = set(
            group['name']
            for group in (package.get('groups') or [])
            )
        organization_groups_name = groups_name_by_organization_name[organization['name']]
        for group_name in organization_groups_name:
            if group_name not in groups_name:
                log.info(u'Adding group {} to package {}'.format(group_name, package['name']))
                request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'], '/api/3/action/member_create'),
                    headers=ckan_headers)
                response = urllib2.urlopen(request, urllib.quote(json.dumps(dict(
                    capacity='public',
                    id=group_name,
                    object=package['name'],
                    object_type='package',
                    ))))
                response_dict = json.loads(response.read())
        for group_name in groups_name:
            if group_name not in organization_groups_name:
                log.info(u'Removing group {} from package {}'.format(group_name, package['name']))
                request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'], '/api/3/action/member_delete'),
                    headers=ckan_headers)
                response = urllib2.urlopen(request, urllib.quote(json.dumps(dict(
                    id=group_name,
                    object=package['name'],
                    object_type='package',
                    ))))

    return 0

def main():
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'dp',
        supplier_title = u'Data Publica',
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_headers = {
        'User-Agent': conf['user_agent'],
        }
    source_site_url = u'http://www.data-publica.com/etalab/export'

    if not args.dry_run:
        harvester.retrieve_target()

    groups_title = set()

    # Retrieve packages from source.
    for page_index in itertools.count():
        page_url = urlparse.urljoin(source_site_url, u'?p={}'.format(page_index))
        log.info(u"Harvesting page {}".format(page_url))
        request = urllib2.Request(page_url.encode('utf-8'), headers = source_headers)
        response = urllib2.urlopen(request)
        response_dict = json.loads(response.read())
        publications = response_dict['publications']
        if not publications:
            break
        for publication in publications:
            publication = conv.check(conv.pipe(
                validate_publication,
                conv.not_none,
                ))(publication, state = conv.default_state)
            groups_title.update(publication['groups'] or [])
            if not args.dry_run:
                groups = [
                    harvester.upsert_group(dict(
                        title = group_title,
                        ))
                    for group_title in (publication['groups'] or [])
                    ]
            organization_title = publication['organization']
            if not args.dry_run:
                if organization_title is None:
                    organization = harvester.supplier
                else:
                    organization = harvester.upsert_organization(dict(
                        title = organization_title,
                        ))
            package = dict(
                license_id = license_id_by_name[publication['licenseName']],
                notes = publication['notes'],
                resources = [
                    dict(
                        name = resource['name'],
                        format = format_by_mime_type[resource['mimeType']],
                        url = urlparse.urljoin(source_site_url, resource['url']),
                        )
                    for resource in (publication['resources'] or [])
                    ],
                tags = [
                    dict(name = strings.slugify(tag_name))
                    for tag_name in sorted(set(
                        tag['name']
                        for tag in (publication['tags'] or [])
                        ))
                    ],
                # territorial_coverage = u'Country/FR/FRANCE',
                title = publication['name'],
                )
            source_url = urlparse.urljoin(source_site_url, publication['url'])
            helpers.set_extra(package, u'Source', source_url)
            log.info(u'Harvested package: {}'.format(package['title']))
            if not args.dry_run:
                harvester.add_package(package, organization, publication['name'], source_url, groups = groups)

    if not args.dry_run:
        harvester.update_target()
    log.info(u'Groups: {}'.format(sorted(groups_title)))

    return 0

def main():
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'prs',
        supplier_title = u'Mairie de Paris',
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_headers = {
        'User-Agent': conf['user_agent'],
        }
    source_site_url = u'http://opendata.paris.fr/opendata/rest/ckan/'

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve names of packages in source.
    request = urllib2.Request(urlparse.urljoin(source_site_url, 'api/3/action/package_list'),
        headers = source_headers)
    response = urllib2.urlopen(request)
    response_dict = json.loads(response.read(), encoding = 'cp1252')
    packages_source_name = conv.check(conv.pipe(
        conv.ckan_json_to_name_list,
        conv.not_none,
        ))(response_dict['result'], state = conv.default_state)

    # Retrieve packages from source.
    for package_source_name in packages_source_name:
        request = urllib2.Request(urlparse.urljoin(source_site_url,
            u'api/3/action/package_show?id={}'.format(package_source_name)).encode('utf-8'),
            headers = source_headers)
        try:
            response = urllib2.urlopen(request)
        except urllib2.HTTPError as response:
            if response.code == 404:
                log.warning(u'Skipping package {}, because page not found'.format(package_source_name))
                continue
            raise
        response_dict = json.loads(response.read())
        if not response_dict['success']:
            log.warning(u'Skipping package {}, because {}'.format(package_source_name, response_dict))
            continue
        source_package = conv.check(conv.pipe(
            before_ckan_json_to_package,
            conv.make_ckan_json_to_package(drop_none_values = True),
            conv.not_none,
            after_ckan_json_to_package,
            ))(response_dict['result'], state = conv.default_state)
        if source_package is None:
            continue
        package = dict(
            # frequency = source_package.get('frequency'),
            license_id = {
                u'ODbL': u'odc-odbl',
                }[source_package.get('license_id')],
            notes = source_package.get('notes'),
            title = source_package['title'],
            resources = [
                dict(
                    created = resource['created'],
                    format = resource.get('format'),
                    last_modified = resource.get('last_modified'),
                    name = resource.get('name') or u'Fichier.{}'.format(resource.get('format')).strip(u'.'),
                    url = resource['url'],
                    )
                for resource in (source_package.get('resources') or [])
                if resource.get('url') is not None
                ],
            tags = [
                dict(name = tag_name)
                for tag_name in sorted(set(
                    strings.slugify(tag['name'])
                    for tag in (source_package.get('tags') or [])
                    ))
                if tag_name and len(tag_name) > 2
                ],
            # temporal_coverage_from = source_package.get('temporal_coverage_from'),
            # temporal_coverage_to = source_package.get('temporal_coverage_to'),
            territorial_coverage = u'CommuneOfFrance/75056/75000 PARIS',
            url = source_package['url'],
            )
        if not args.dry_run:
            groups = source_package.get('groups')
            if groups is not None:
                groups = [
                    harvester.upsert_group(dict(
                        # Don't reuse image and description of groups, because Etalab has its own.
                        # description = group.get(u'description'),
                        # image_url = group.get(u'image_url'),
                        title = group_title,
                        ))
                    for group_title in sorted(list(set(
                        group['title']
                        for group in (groups or [])
                        )) + [u"Territoires et Transports"])
                    ]
        log.info(u'Harvested package: {}'.format(package['title']))
        if not args.dry_run:
            harvester.add_package(package, harvester.supplier, source_package['name'], package['url'],
                groups = groups)

def main():
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('download_dir', help = 'directory where are stored downloaded HTML pages')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        admin_name = u'b-dot-kessler-at-agglo-rennesmetropole-dot-fr',
        supplier_abbreviation = u'rm',
        supplier_title = u'Rennes Métropole en accès libre',
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )

    # Retrieve paths of HTML pages to convert.
    data_dir = os.path.join(args.download_dir, 'data')
    assert os.path.exists(data_dir), "Data directory {0} doesn't exist".format(data_dir)
    data_file_path_by_number = {}
    for (dir, directories_name, filenames) in os.walk(data_dir):
        for directory_name in directories_name[:]:
            if directory_name.startswith('.'):
                directories_name.remove(directory_name)
        for filename in filenames:
            data_file_path = os.path.join(dir, filename)
            match = data_filename_re.match(os.path.basename(data_file_path))
            assert match is not None, data_file_path
            data_number = int(match.group('number'))
            data_file_path_by_number[data_number] = data_file_path

    if not args.dry_run:
        harvester.retrieve_target()

    # Convert source HTML packages to CKAN JSON.
    for data_number, data_file_path in sorted(data_file_path_by_number.iteritems()):
        with open(data_file_path) as data_file:
            try:
                data_str = data_file.read()
                data_html = etree.fromstring(data_str, html_parser)
                html_base_list = data_html.xpath('head/base[@href]')
                base_url = html_base_list[0].get('href')
                dataset_html = data_html.xpath('.//div[@class="tx_icsopendatastore_pi1_single"]')[0]
                assert dataset_html is not None
                title_str = dataset_html.xpath('.//h3')[0].text.strip()
                assert title_str
                publisher_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_publisher separator"]/p[@class="value description"]')
                publisher_str = publisher_html_list[0].text.strip() or None if publisher_html_list else None
                contact_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_contact separator"]/p[@class="value description"]')
                contact_str = contact_html_list[0].text.strip() or None if contact_html_list else None
                creator_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_creator separator"]/p[@class="value description"]')
                creator_str = creator_html_list[0].text.strip() or None if creator_html_list else None
                owner_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_owner separator"]/p[@class="value owner"]')
                owner_str = owner_html_list[0].text.strip() or None if owner_html_list else None
                organization_title, author = conv.check(conv.pipe(
                    conv.test_in(organization_titles_by_owner_str),
                    conv.translate(organization_titles_by_owner_str),
                    conv.default((u"Rennes Métropole", None)),
                    ))(owner_str, state = conv.default_state)
                if not args.dry_run:
                    organization = harvester.upsert_organization(dict(
                        title = organization_title,
                        ))
                categories_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_categories separator"]/p[@class="value description"]')
                categories_str = categories_html_list[0].text.strip() or None if categories_html_list else None
                tags = [
                    dict(name = tag_name)
                    for tag_name in sorted(set(
                        strings.slugify(category_fragment)
                        for category_str in categories_str.split(u',')
                        for category_fragment in category_str.split(u':')
                        ))
                    ]
                if not args.dry_run:
                    groups = [
                        harvester.upsert_group(dict(
                            title = categories_str.split(u',')[0].strip(),
                            )),
                        harvester.upsert_group(dict(
                            title = u'Territoires et Transports',
                            )),
                        ] if categories_str else None
                release_date_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_releasedate separator"]/p[@class="value description"]')
                release_date_str = release_date_html_list[0].text if release_date_html_list else None
                release_date_iso8601_str = conv.check(conv.pipe(
                    french_input_to_date,
                    conv.date_to_iso8601_str,
                    ))(release_date_str, state = conv.default_state)
                update_date_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_updatedate separator"]/p[@class="value description"]')
                update_date_str = update_date_html_list[0].text if update_date_html_list else None
                update_date_iso8601_str = conv.check(conv.pipe(
                    french_input_to_date,
                    conv.date_to_iso8601_str,
                    ))(update_date_str, state = conv.default_state)
                frequency_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_updatefrequency separator"]'
                    '/p[@class="value description"]')
                frequency_str = frequency_html_list[0].text if frequency_html_list else None
                frequency = conv.check(conv.pipe(
                    conv.cleanup_line,
                    conv.test_in(frequency_translations),
                    conv.translate(frequency_translations),
                    ))(frequency_str, state = conv.default_state)
                description_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_description separator"]/p[@class="value description"]')
                description_str = description_html_list[0].text.strip() or None if description_html_list else None
                technical_data_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_technical_data separator"]'
                    '/p[@class="value technical_data"]')
                technical_data_str = technical_data_html_list[0].text.strip() or None if technical_data_html_list \
                    else None
                license_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_licence separator"]/p[@class="value owner"]/a')
                license_str = license_html_list[0].text if license_html_list else None
                license_id = conv.check(conv.pipe(
                    conv.cleanup_line,
                    conv.test_in(license_id_by_str),
                    conv.translate(license_id_by_str),
                    ))(license_str, state = conv.default_state)
                resources = []
                for resource_html in dataset_html.xpath('.//div[@class="tx_icsopendatastore_pi1_file"]'):
                    resource_url = urlparse.urljoin(base_url, resource_html.xpath('.//a[@href]')[0].get('href'))
                    resource_path = urlparse.urlsplit(resource_url)
                    # Fall back to a generic name when the URL ends with a slash.
                    filename = resource_url.rstrip('/').rsplit(u'/', 1)[-1] or u'Fichier'
                    resources.append(dict(
                        created = release_date_iso8601_str,
                        format = resource_html.xpath('.//span[@class="coin"]')[0].text.strip() or None,
                        last_modified = update_date_iso8601_str,
                        name = filename,
                        url = resource_url,
                        ))
            except:
                print 'An exception occurred in file {0}'.format(data_number)
                raise
        package = dict(
            author = author,
            frequency = frequency,
            license_id = license_id,
            maintainer = contact_str,
            notes = description_str,
            resources = resources,
            tags = tags,
            territorial_coverage = u'IntercommunalityOfFrance/243500139/CA RENNES METROPOLE',
            title = title_str,
            url = u'http://www.data.rennes-metropole.fr/les-donnees/catalogue/'
                u'?tx_icsopendatastore_pi1[uid]={}'.format(data_number),
            )
        helpers.set_extra(package, u'Données techniques', technical_data_str)
        helpers.set_extra(package, u'Éditeur', publisher_str)
        helpers.set_extra(package, u'Auteur', creator_str)
        if not args.dry_run:
            harvester.add_package(package, organization, package['title'], package['url'], groups = groups)

    if not args.dry_run:
        harvester.update_target()

    return 0

def main():
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'opendatasoft.ckan.password': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'opendatasoft.ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'opendatasoft.ckan.username': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-OpenDataSoft-Harvester')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'ods',
        supplier_title = u"OpenDataSoft",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_headers = {
        'Authorization': 'Basic {}'.format(base64.encodestring('{}:{}'.format(
            conf['opendatasoft.ckan.username'], conf['opendatasoft.ckan.password'])).replace('\n', '')),
        'User-Agent': conf['user_agent'],
        }
    source_site_url = conf['opendatasoft.ckan.site_url']

    harvester.retrieve_target()

    # Retrieve names of packages in source.
    request = urllib2.Request(urlparse.urljoin(source_site_url, 'api/3/action/package_list'),
        headers = source_headers)
    response = urllib2.urlopen(request)
    response_dict = json.loads(response.read())
    packages_source_name = conv.check(conv.pipe(
        conv.ckan_json_to_name_list,
        conv.not_none,
        ))(response_dict['result'], state = conv.default_state)

    # Retrieve packages from source.
    for package_source_name in packages_source_name:
        request = urllib2.Request(urlparse.urljoin(source_site_url, 'api/3/action/package_show'),
            headers = source_headers)
        response = urllib2.urlopen(request, urllib.quote(json.dumps(dict(
            id = package_source_name,
            ))))  # CKAN 1.7 requires a POST.
        response_dict = json.loads(response.read())
        package = conv.check(conv.pipe(
            before_ckan_json_to_package,
            conv.make_ckan_json_to_package(drop_none_values = True),
            after_ckan_json_to_package,
            ))(response_dict['result'], state = conv.default_state)
        if package is None:
            continue
        publisher = helpers.get_extra(package, 'publisher')
        organization_infos = {
            None: (u"OpenDataSoft", None),
            u"adt-et-ots-des-alpes-de-haute-provence": False,  # Datasets must be merged.
            u"agence-bio": False,  # Direct member of data.gouv.fr
            u"agence-des-espaces-verts-idf": (None, None),
            u"autolib": (None, None),
            u"comite-departemental-de-tourisme-du-pas-de-calais": False,
            u"conseil-general-des-hauts-de-seine": (None, None),
            u"ctc-corse": False,  # Bad titles and descriptions
            u"direction-regionale-du-travail-de-l-emploi-et-de-la-formation-professionnelle": False,
                # Direct member of data.gouv.fr?
            u"driea-sit-del-2": (None, None),
            u"federation-nationale-des-bistrots-de-pays": (None, None),
            u"gip-corse-competences": (u"GIP Corse Compétences", None),
            u"iau-idf": (None, None),
            u"ign": False,  # Direct member of data.gouv.fr
            u"insee": False,  # Direct member of data.gouv.fr
            u"jcdecaux-developer": (None, None),
            u"la-poste": False,  # Direct member of data.gouv.fr
            u"le-rif": (None, None),
            u"ministere-de-l-education-nationale": False,  # Direct member of data.gouv.fr
            u"ministere-de-l-interieur": False,  # Direct member of data.gouv.fr
            u"ministere-de-la-culture-et-de-la-communication": False,  # Direct member of data.gouv.fr
            u"ministere-de-la-justice": False,  # Direct member of data.gouv.fr
            u"ministere-des-sports": False,  # Direct member of data.gouv.fr
            u"oehc": False,  # Bad titles and descriptions
            u"premier-ministre-direction-de-l-information-legale-et-administrative": False,
                # Direct member of data.gouv.fr
            u"ratp": False,  # Direct member of data.gouv.fr
            u"reseau-ferre-de-france": False,  # Direct member of data.gouv.fr
            u"region-ile-de-france": False,  # Datasets must be merged.
            u"sncf": (u"Société nationale des chemins de fer français", None),
                # Direct member of data.gouv.fr, but other datasets
            u"societe-nationale-des-chemins-de-fer-francais": False,  # Direct member of data.gouv.fr
            u"ville-de-paris": (u"Mairie de Paris", None),
            u"ville-de-paris-direction-de-la-proprete-et-de-l-eau": (u"Mairie de Paris",
                u"Direction de la propreté et de l'eau"),
            }.get(strings.slugify(publisher))
        if organization_infos is None:
            log.warning(u'Ignoring package "{}" from unknown publisher "{}"'.format(package['title'], publisher))
            continue
        if organization_infos is False:
            continue
        organization_title, author = organization_infos
        if organization_title is None:
            organization_title = publisher
        organization = harvester.upsert_organization(dict(
            title = organization_title,
            ))
        package['author'] = author
        package.pop('groups', None)
        source_name = package.pop('name')
        package.pop('users', None)
        for resource in package['resources']:
            if resource['format'] == 'HTML':
                source_url = resource['url']
                break
        else:
            # TODO: Find the source URL of packages without an HTML resource.
            source_url = u'TODO URL'
        helpers.set_extra(package, u'Source', source_url)
        helpers.pop_extra(package, 'publisher', None)
        package = conv.check(conv.ckan_input_package_to_output_package)(package, state = conv.default_state)
        log.info(u'Harvested package: {}'.format(package['title']))
        harvester.add_package(package, organization, source_name, source_url)

    harvester.update_target()
    return 0