示例#1
0
def dataset_upserted(dataset):
    """Create or update the Tumblr post that mirrors *dataset*.

    Datasets without a "related" entry are ignored.  The Tumblr post id is
    stored in ``db`` under the dataset id, so subsequent calls edit the
    existing post instead of creating a duplicate.
    """
    if not dataset.get('related'):
        return None
    log.debug(u'Updating dataset post in tumbler "{}".'.format(dataset['name']))
    template = templates_lookup.get_template('dataset.mako')
    body = template.render_unicode(
        conf = conf,
        dataset = dataset,
        ).strip()
    # Previously published post id (as a string), or None if never posted.
    post_id_str = db.get(str(dataset['id']))
    if post_id_str is None:
        # No post yet: create a new text post on the configured blog.
        response = requests.post('https://api.tumblr.com/v2/blog/{}/post'.format(conf['tumblr.hostname']),
            auth = oauth,
            data = dict(
                body = body,
                format = 'html',
                slug = strings.slugify(dataset['name']),
                state = 'published',
                tags = 'opendata,dataviz',
                title = dataset['title'],
                type = 'text',
                ),
            headers = headers,
            )
        # Extract the new post id from the response; fail loudly when missing.
        post_id = conv.check(conv.pipe(
            tumblr_response_to_id,
            conv.not_none,
            ))(response.text, state = conv.default_state)
        # Remember the post id so future upserts edit instead of create.
        db[str(dataset['id'])] = str(post_id)
    else:
        # A post already exists: edit it in place, keeping the same id.
        response = requests.post('https://api.tumblr.com/v2/blog/{}/post/edit'.format(conf['tumblr.hostname']),
            auth = oauth,
            data = dict(
                body = body,
                format = 'html',
                id = int(post_id_str),
                slug = strings.slugify(dataset['name']),
                state = 'published',
                tags = 'opendata,dataviz',
                title = dataset['title'],
                type = 'text',
                ),
            headers = headers,
            )
        post_id = conv.check(conv.pipe(
            tumblr_response_to_id,
            conv.not_none,
            ))(response.text, state = conv.default_state)
示例#2
0
 def compute_words(self):
     """Refresh ``self.words``: sorted unique slug fragments of this dataset.

     Fragments are gathered from the id, notes, title, related links,
     resources and tags, joined with "-", slugified and split again, so the
     result is a de-duplicated word index.  ``None`` when nothing remains.
     """
     self.words = sorted(set(strings.slugify(u'-'.join(
         fragment
         for fragment in itertools.chain(
             (
                 self._id,
                 texthelpers.textify_markdown(self.notes),
                 self.title,
                 ),
             # Description and title of each related link.
             itertools.chain(*(
                 [
                     texthelpers.textify_markdown(related_link.get('description')),
                     related_link.get('title'),
                     ]
                 for related_link in (self.related or [])
                 )),
             # Description, format and name of each resource.
             itertools.chain(*(
                 [
                     texthelpers.textify_markdown(resource.get('description')),
                     resource.get('format'),
                     resource.get('name'),
                     ]
                 for resource in (self.resources or [])
                 )),
             (
                 tag['name']
                 for tag in (self.tags or [])
                 ),
             )
         if fragment is not None
         )).split(u'-'))) or None
示例#3
0
 def json_to_python(self):
     """Return a converter from JSON values to this column's Python value.

     When the column has no enumeration, defer to the parent implementation.
     Otherwise the converter accepts either an item index (int) or an item
     name (string), which is slugified and mapped back to its index.
     """
     enum = self.enum
     if enum is None:
         return super(EnumCol, self).json_to_python
     # This converter accepts either an item number or an item name.
     index_by_slug = self.index_by_slug
     if index_by_slug is None:
         # Lazily build and memoize the slug -> index mapping.
         self.index_by_slug = index_by_slug = dict(
             (strings.slugify(name), index)
             for index, name in sorted(enum._vars.iteritems() if enum is not None else ())
             )
     return conv.pipe(
         conv.condition(
             conv.test_isinstance(basestring),
             conv.pipe(
                 # Convert item name to its index.
                 conv.input_to_slug,
                 conv.test_in(index_by_slug),
                 conv.function(lambda slug: index_by_slug[slug]),
                 ),
             conv.pipe(
                 # Verify that item index belongs to enumeration.
                 conv.test_isinstance(int),
                 conv.test_in(enum._vars),
                 ),
             ),
         conv.default(
             # Fall back to the declared default when valid, else the
             # smallest enumeration index.
             self._default
             if self._default is not None and self._default in enum._nums
             else min(enum._vars.iterkeys())
             ),
         )
def user_extract(req):
    """Duplicate the current legislation as a dated copy owned by the user.

    Requires an authenticated user with an email; refuses when the current
    legislation is already dated.  When a copy with the same slug already
    exists and belongs to the user, redirect to it; when the slug is taken
    by someone else, reject.  Otherwise build the copy, fill it with the
    dated legislation returned by the API, save it and redirect to it.
    """
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check = True)
    if user.email is None:
        return wsgihelpers.forbidden(ctx)
    legislation = ctx.node
    if legislation.is_owner(ctx) and legislation.is_dated:
        return wsgihelpers.bad_request(ctx, explanation = ctx._(u'This legislation is already dated.'))

    params = req.GET
    inputs = {
        'date': params.get('date'),
        }
    # Parse the requested date, defaulting to "now" when absent.
    data, errors = conv.struct({
        'date': conv.pipe(
            conv.french_formatted_str_to_datetime,
            conv.default(datetime.datetime.utcnow()),
            ),
        })(inputs, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = errors)

    new_legislation = None
    new_legislation_title = ctx._(u'{} (copy {})').format(legislation.title, user.email)
    new_legislation_slug = strings.slugify(new_legislation_title)
    existing_legislations_cursor = model.Legislation.find(
        dict(
            slug = new_legislation_slug,
            ),
        as_class = collections.OrderedDict,
        )
    if existing_legislations_cursor.count() > 0:
        for existing_legislation in existing_legislations_cursor:
            if existing_legislation.is_owner(ctx):
                return wsgihelpers.redirect(ctx, location = existing_legislation.get_user_url(ctx))
        # NOTE(review): new_legislation is always None at this point, so this
        # guard always fires when the slug is owned by another user.
        if new_legislation is None:
            return wsgihelpers.bad_request(
                ctx,
                explanation = ctx._(u'A legislation with the same name already exists.'),
                )
    else:
        new_legislation = model.Legislation(
            author_id = user._id,
            datetime_begin = legislation.datetime_begin,
            datetime_end = legislation.datetime_end,
            description = ctx._(u'Copy of legislation "{}"').format(legislation.title),
            title = new_legislation_title,
            slug = new_legislation_slug,
            )
        # Ask the legislation API to compute the dated legislation content.
        response = requests.post(
            conf['api.urls.legislations'],
            headers = {
                'Content-Type': 'application/json',
                'User-Agent': conf['app_name'],
                },
            data = json.dumps(dict(date = data['date'].isoformat(), legislation = legislation.json)),
            )
        new_legislation.json = response.json(object_pairs_hook = collections.OrderedDict).get('dated_legislation')
        new_legislation.save(safe = True)
    return wsgihelpers.redirect(ctx, location = new_legislation.get_user_url(ctx))
示例#5
0
 def id_or_name_or_words_to_instance(value, state = None):
     """Convert an ID, exact name, or words string to a single instance.

     Lookup order: UUID-shaped values by id, otherwise by exact name, and
     finally by matching every word of the slugified value as a prefix of
     the instance's indexed ``words``.  Returns ``(instance, None)`` on
     success, ``(value, error_message)`` on failure, following the
     project's converter convention.  ``cls`` and ``uuid_re`` come from the
     enclosing scope.
     """
     if value is None:
         return value, None
     if state is None:
         state = conv.default_state
     match = uuid_re.match(value)
     if match is None:
         self = cls.find_one(dict(name = value), as_class = collections.OrderedDict)
     else:
         self = cls.find_one(value, as_class = collections.OrderedDict)
     if self is None:
         # Fall back to a word-prefix search on the slugified value.
         slug = strings.slugify(value)
         words = sorted(set(slug.split(u'-')))
         # limit(2) is enough to detect ambiguity without fetching everything.
         instances = list(cls.find(
             dict(
                 words = {'$all': [
                     re.compile(u'^{}'.format(re.escape(word)))
                     for word in words
                     ]},
                 ),
             as_class = collections.OrderedDict,
             ).limit(2))
         if not instances:
             return value, state._(u"No organization with ID, name or words: {0}").format(value)
         if len(instances) > 1:
             return value, state._(u"Too much organizations with words: {0}").format(u' '.join(words))
         self = instances[0]
     return self, None
示例#6
0
def find_category_name(column_name, entity_name):
    """For a given column, find its category name."""
    categories = fields_api_data()['columns_tree'][entity_name]['children']
    return next(
        (
            strings.slugify(category['label'], separator = '_')
            for category in categories
            if column_name in category['children']
            ),
        None,
        )
示例#7
0
File: group.py — Project: etalab/weckan
def create_group_or_org(request, is_org):
    """Render the group/organization creation form, or create it on POST."""
    ctx = contexts.Ctx(request)
    lang = request.urlvars.get('lang', templates.DEFAULT_LANG)
    user = auth.get_user_from_request(request)
    if not user:
        # Unauthenticated users cannot create anything.
        return wsgihelpers.unauthorized(ctx)  # redirect to login/register ?

    form = GroupForm(request.POST, i18n=ctx.translator)

    if request.method == 'POST' and form.validate():
        slug = strings.slugify(form.title.data)
        payload = {
            'name': slug,
            'title': form.title.data,
            'description': form.description.data,
            'image_url': form.image_url.data,
        }
        ckan_api('organization_create' if is_org else 'group_create', user, payload)

        redirect_url = urls.get_url(lang,
                                    'organization' if is_org else 'group',
                                    slug)
        return wsgihelpers.redirect(ctx, location=redirect_url)

    # GET (or invalid POST): show the creation form again.
    back_url = urls.get_url(lang, 'organizations' if is_org else 'groups')
    return templates.render_site('forms/group-create-form.html',
                                 request,
                                 is_new=True,
                                 is_org=is_org,
                                 form=form,
                                 back_url=back_url)
示例#8
0
    def after_show(self, context, pkg_dict):
        """CKAN ``after_show`` hook: expose territory cookie and supplier.

        Copies the "territory" cookie (when present and valid JSON) onto
        ``tk.c``, adding a slugified full name, then attaches the supplier
        organization dict to *pkg_dict* when it has a ``supplier_id``.
        """
        try:
            cookies = tk.request.cookies
        except TypeError:
            # TypeError: No object (name: request) has been registered for this thread.
            cookies = None
        if cookies is not None:
            territory_json_str = cookies.get('territory')
            if territory_json_str:
                c = tk.c
                try:
                    c.territory = json.loads(territory_json_str)
                except ValueError:
                    # Ignore a malformed cookie value.
                    pass
                else:
                    full_name = c.territory.get('full_name')
                    if full_name is not None:
                        c.territory['full_name_slug'] = strings.slugify(
                            full_name)

        # Add supplier to pkg_dict
        from ckan.lib.dictization import model_dictize
        supplier_id = pkg_dict.get('supplier_id')
        if supplier_id is not None:
            # Code derivated from model_dictize.package_dictize.
            model = context['model']
            group_rev = model.group_revision_table
            q = select([group_rev]) \
                .where(group_rev.c.id == supplier_id) \
                .where(group_rev.c.state == 'active')
            result = model_dictize._execute_with_revision(
                q, group_rev, context)
            organizations = dictization.obj_list_dictize(result, context)
            pkg_dict['supplier'] = organizations[0] if organizations else None
示例#9
0
 def json_to_python(self):
     """Return a converter from JSON values to this column's Python value.

     Without an enumeration, coerce strings/ints to int.  With one, accept
     either an item index (int) or an item name (string), which is
     slugified and mapped back to its index.
     """
     enum = self.enum
     if enum is None:
         return conv.pipe(
             conv.test_isinstance((basestring, int)),
             conv.anything_to_int,
         )
     # This converter accepts either an item number or an item name.
     index_by_slug = self.index_by_slug
     if index_by_slug is None:
         # Lazily build and memoize the slug -> index mapping.
         self.index_by_slug = index_by_slug = dict(
             (strings.slugify(name), index)
             for index, name in sorted(enum._vars.iteritems()))
     return conv.pipe(
         conv.test_isinstance((basestring, int)),
         conv.condition(
             conv.anything_to_int,
             conv.pipe(
                 # Verify that item index belongs to enumeration.
                 conv.anything_to_int,
                 conv.test_in(enum._vars),
             ),
             conv.pipe(
                 # Convert item name to its index.
                 conv.input_to_slug,
                 conv.test_in(index_by_slug),
                 conv.function(lambda slug: index_by_slug[slug]),
             ),
         ),
     )
示例#10
0
 def json_to_dated_python(self):
     """Return a converter from JSON values to this column's dated value.

     Without an enumeration, coerce strings/ints to int.  With one, accept
     either an item index (int) or an item name (string), which is
     slugified and mapped back to its index.
     """
     enum = self.enum
     if enum is None:
         return conv.pipe(
             conv.test_isinstance((basestring, int)),
             conv.anything_to_int,
             )
     # This converter accepts either an item number or an item name.
     index_by_slug = self.index_by_slug
     if index_by_slug is None:
         # Lazily build and memoize the slug -> index mapping.
         self.index_by_slug = index_by_slug = dict(
             (strings.slugify(name), index)
             for index, name in sorted(enum._vars.iteritems())
             )
     return conv.pipe(
         conv.test_isinstance((basestring, int)),
         conv.condition(
             conv.anything_to_int,
             conv.pipe(
                 # Verify that item index belongs to enumeration.
                 conv.anything_to_int,
                 conv.test_in(enum._vars),
                 ),
             conv.pipe(
                 # Convert item name to its index.
                 conv.input_to_slug,
                 conv.test_in(index_by_slug),
                 conv.function(lambda slug: index_by_slug[slug]),
                 ),
             ),
         )
示例#11
0
    def after_show(self, context, pkg_dict):
        """CKAN ``after_show`` hook: expose territory cookie and supplier.

        Copies the "territory" cookie (when present and valid JSON) onto
        ``tk.c``, adding a slugified full name, then attaches the supplier
        organization dict to *pkg_dict* when it has a ``supplier_id``.
        """
        try:
            cookies = tk.request.cookies
        except TypeError:
            # TypeError: No object (name: request) has been registered for this thread.
            cookies = None
        if cookies is not None:
            territory_json_str = cookies.get('territory')
            if territory_json_str:
                c = tk.c
                try:
                    c.territory = json.loads(territory_json_str)
                except ValueError:
                    # Ignore a malformed cookie value.
                    pass
                else:
                    full_name = c.territory.get('full_name')
                    if full_name is not None:
                        c.territory['full_name_slug'] = strings.slugify(full_name)

        # Add supplier to pkg_dict
        from ckan.lib.dictization import model_dictize
        supplier_id = pkg_dict.get('supplier_id')
        if supplier_id is not None:
            # Code derivated from model_dictize.package_dictize.
            model = context['model']
            group_rev = model.group_revision_table
            q = select([group_rev]) \
                .where(group_rev.c.id == supplier_id) \
                .where(group_rev.c.state == 'active')
            result = model_dictize._execute_with_revision(q, group_rev, context)
            organizations = dictization.obj_list_dictize(result, context)
            pkg_dict['supplier'] = organizations[0] if organizations else None
示例#12
0
def send_stats():
    """Compute aggregate open-data metrics and push them to the dactylo API.

    Reads the module-level ``metrics`` dict, rebuilds the module-level
    ``stats`` dict, then POSTs it (with the API key) to the configured
    "api/1/states" endpoint.  HTTP errors are logged and re-raised.
    """
    # Weights of the datasets that have one (None weights are skipped).
    datasets_weight = [
        weight
        for weight in (
            dataset['weight']
            for dataset in metrics['datasets'].itervalues()
            )
        if weight is not None
        ]
    datasets_total_weight = sum(datasets_weight)
    global stats
    stats = dict(
        datasets_average_weight = round(datasets_total_weight / len(datasets_weight), 2),
        datasets_count = len(metrics['datasets']),
        datasets_median_weight = round(median(datasets_weight), 2),
#        datasets_median_80_percent_weight = median_80_percent(datasets_weight),
        datasets_total_weight = round(datasets_total_weight, 2),
        # Number of distinct (slugified) resource formats across all datasets.
        formats_count = len(set(
            strings.slugify(resource['format'])
            for dataset in metrics['datasets'].itervalues()
            for resource in dataset['resources']
            )),
        organizations_count = len(metrics['organizations']),
        organizations_public_services_count = sum(
            1
            for organization in metrics['organizations'].itervalues()
            if organization['public_service']
            ),
        related_count = sum(
            dataset['related_count']
            for dataset in metrics['datasets'].itervalues()
            ),
        resources_count = sum(
            len(dataset['resources'])
            for dataset in metrics['datasets'].itervalues()
            ),
        )

    request = urllib2.Request(urlparse.urljoin(conf['dactylo.site_url'], 'api/1/states'), headers = request_headers)
    request_data = dict(
        api_key = conf['dactylo.api_key'],
        value = stats,
        )
    try:
        response = urllib2.urlopen(request, json.dumps(request_data))
    except urllib2.HTTPError as response:
        # Log the failed stats, dump the server's error body, then re-raise.
        log.error(u'An error occured while setting stats: {}'.format(stats))
        response_text = response.read()
        try:
            response_dict = json.loads(response_text)
        except ValueError:
            log.error(response_text)
            raise
        for key, value in response_dict.iteritems():
            print '{} = {}'.format(key, value)
        raise
    else:
        assert response.code == 200
        conv.check(cow_response_to_value)(response.read(), state = conv.default_state)
示例#13
0
 def compute_words(self):
     """Refresh ``self.words`` with this user's sorted unique slug fragments."""
     fragments = [
         fragment
         for fragment in (self._id, self.email, self.full_name)
         if fragment is not None
         ]
     slug = strings.slugify(u'-'.join(fragments))
     self.words = sorted(set(slug.split(u'-'))) or None
示例#14
0
 def compute_words(self):
     """Refresh ``self.words`` from this document's searchable fragments."""
     # Description/title fragments are intentionally disabled here; only the
     # id currently contributes to the word index.
     fragments = [
         fragment
         for fragment in (self._id,)
         if fragment is not None
         ]
     slug = strings.slugify(u'-'.join(fragments))
     self.words = sorted(set(slug.split(u'-'))) or None
 def setUp(self):  # noqa
     """Create the shared legislation fixture used by these tests."""
     super(TestLegislations, self).setUp()
     self.ctx = contexts.Ctx()
     title = u'Legislation 1'
     self.legislation = model.Legislation(
         description = title,
         slug = strings.slugify(title),
         title = title,
         )
     self.legislation.save(safe = True)
示例#16
0
 def compute_words(self):
     """Refresh ``self.words`` with sorted unique slug fragments of this document."""
     fragments = (
         self._id,
         texthelpers.textify_markdown(self.description),
         self.title,
         )
     joined = u'-'.join(fragment for fragment in fragments if fragment is not None)
     self.words = sorted(set(strings.slugify(joined).split(u'-'))) or None
示例#17
0
 def name_package(self, title):
     """Return a package name for *title*, unique in ``self.package_by_name``.

     The name is "<truncated slug>[-<n>]-<abbreviation>", kept within 100
     characters; the numeric differentiator is appended from the second
     attempt onwards.
     """
     abbreviation = self.supplier_abbreviation
     for index in itertools.count(1):
         differentiator = u'' if index <= 1 else u'-{}'.format(index)
         max_slug_length = 100 - len(abbreviation) - 1 - len(differentiator)
         slug = strings.slugify(title)[:max_slug_length].rstrip(u'-')
         name = u'{}{}-{}'.format(slug, differentiator, abbreviation)
         if name not in self.package_by_name:
             return name
示例#18
0
 def compute_words(self):
     """Refresh ``self.words`` with sorted unique slug fragments of this document."""
     joined = u'-'.join(
         unicode(fragment)
         for fragment in (self._id, self.description, self.title)
         if fragment is not None
         )
     self.words = sorted(set(strings.slugify(joined).split(u'-'))) or None
示例#19
0
 def compute_words(self):
     """Refresh ``self.words`` with this account's sorted unique slug fragments."""
     joined = u'-'.join(
         unicode(fragment)
         for fragment in (self._id, self.email, self.full_name)
         if fragment is not None
         )
     self.words = sorted(set(strings.slugify(joined).split(u'-'))) or None
示例#20
0
    def __init__(self, admin_name = None, old_supplier_title = None, supplier_abbreviation = None,
            supplier_title = None, target_headers = None, target_site_url = None):
        """Configure this harvester for one supplier.

        ``supplier_abbreviation`` must be a 2-4 character string equal to
        its own slug; supplier titles are slugified into names of at most
        100 characters; ``target_headers`` must provide "Authorization" and
        "User-Agent" for the target site's API.
        """
        if admin_name is not None:
            self.admin_name = admin_name

        # Optional previous supplier title (e.g. after a rename), kept with
        # its derived slug for lookups of already-harvested data.
        if old_supplier_title is not None:
            assert isinstance(old_supplier_title, unicode)
            self.old_supplier_title = old_supplier_title
            old_supplier_name = strings.slugify(old_supplier_title)
            assert old_supplier_name
            assert len(old_supplier_name) <= 100
            self.old_supplier_name = old_supplier_name

        assert isinstance(supplier_abbreviation, unicode)
        assert supplier_abbreviation == strings.slugify(supplier_abbreviation)
        assert 1 < len(supplier_abbreviation) < 5
        self.supplier_abbreviation = supplier_abbreviation

        assert isinstance(supplier_title, unicode)
        self.supplier_title = supplier_title
        supplier_name = strings.slugify(supplier_title)
        assert supplier_name
        assert len(supplier_name) <= 100
        self.supplier_name = supplier_name

        assert isinstance(target_headers, dict)
        assert isinstance(target_headers['Authorization'], basestring)
        assert isinstance(target_headers['User-Agent'], basestring)
        self.target_headers = target_headers

        assert isinstance(target_site_url, unicode)
        self.target_site_url = target_site_url

        # Per-run caches of harvested objects, keyed by (slugified) name.
        self.existing_packages_name = set()
        self.group_by_name = {}
        self.organization_by_name = {}
        self.organization_name_by_package_name = {}
        self.package_by_name = {}
        self.package_source_by_name = {}
        self.packages_by_organization_name = {}
        self.related_by_package_name = {}
示例#21
0
def get_cookie(request):
    """Extract territory info from the ``territory-infos`` cookie.

    Returns a dict with ``full_name``, its slug, and the INSEE ``depcom``
    code; all values default to the empty string when the cookie is absent
    or malformed.
    """
    raw = request.cookies.get('territory-infos', '')
    if raw.count('|') == 1:
        territory_key = request.cookies.get('territory-infos').split('|')[0]
        territory = fetch(*territory_key.split('/')) if territory_key else {}
    else:
        territory = {}

    full_name = territory.get('full_name', '')
    return {
        'full_name': full_name,
        'full_name_slug': strings.slugify(full_name),
        'depcom': territory.get('code', '')
    }
示例#22
0
def build_slug(title, previous=None):
    """Return a package slug for *title*, unique among ``Package.name``.

    The slug is the slugified title truncated to ``PACKAGE_NAME_MAX_LENGTH``.
    When that slug is already taken (and differs from *previous*, the
    caller's current slug, which is always acceptable), numeric suffixes
    "-0", "-1", ... are appended, truncating the base only as much as
    needed to stay within the length limit.
    """
    base_slug = strings.slugify(title)[:PACKAGE_NAME_MAX_LENGTH]
    exists_query = DB.query(Package.name)

    def slug_exists(slug):
        # A slug is taken when any package already uses it as its name.
        return exists_query.filter(Package.name == slug).count() > 0

    if base_slug == previous or not slug_exists(base_slug):
        return base_slug
    idx = 0
    while True:
        suffix = '-{0}'.format(idx)
        # Truncate the base only when base + suffix would exceed the limit.
        # (The previous implementation always chopped len(suffix) characters
        # off the end, needlessly shortening slugs and reducing short ones
        # to an empty base, e.g. "ab" -> "-0".)
        slug = ''.join([base_slug[:PACKAGE_NAME_MAX_LENGTH - len(suffix)], suffix])
        if slug == previous or not slug_exists(slug):
            return slug
        idx += 1
示例#23
0
def duplicate(req):
    """Clone the current test case for the logged-in user, then redirect."""
    ctx = contexts.Ctx(req)
    original = ctx.node
    user = model.get_user(ctx, check = True)
    copy_title = ctx._(u'Copy of {}').format(original.title)
    copy = model.TestCase(
        author_id = user._id,
        description = copy_title,
        title = copy_title,
        slug = strings.slugify(copy_title),
        )
    copy.save(safe = True)
    return wsgihelpers.redirect(ctx, location = user.get_user_url(ctx))
示例#24
0
File: group.py — Project: etalab/weckan
def edit_group_or_org(request, is_org):
    """Render the group/organization edit form, or apply the update on POST.

    Requires an authenticated user and an existing group.  On a valid POST
    the group is renamed to the slugified title and updated through the
    CKAN API, re-sending its extras and members so they are preserved.
    """
    context = contexts.Ctx(request)
    lang = request.urlvars.get('lang', templates.DEFAULT_LANG)
    user = auth.get_user_from_request(request)
    if not user:
        return wsgihelpers.unauthorized(
            context)  # redirect to login/register ?

    group_name = request.urlvars.get('name')
    group = Group.by_name(group_name)
    if not group:
        return wsgihelpers.not_found(context)
    form = GroupForm(request.POST, group, i18n=context.translator)

    if request.method == 'POST' and form.validate():
        name = strings.slugify(form.title.data)
        # Re-send the existing extras so the update does not drop them.
        extras = [{
            'key': key,
            'value': value
        } for key, value in group.extras.items()]
        ckan_api(
            'organization_update' if is_org else 'group_update', user, {
                'id': group.id,
                'name': name,
                'title': form.title.data,
                'description': form.description.data,
                'image_url': form.image_url.data,
                'extras': extras,
                'users': _get_members(group),
            })

        redirect_url = urls.get_url(lang,
                                    'organization' if is_org else 'group',
                                    name)
        return wsgihelpers.redirect(context, location=redirect_url)

    # GET (or invalid POST): show the edit form again.
    group_type = 'organization' if is_org else 'group'
    group_base_url = urls.get_url(lang, group_type)
    back_url = urls.get_url(lang, group_type, group.name)
    delete_url = urls.get_url(lang, group_type, 'delete', group.name)
    return templates.render_site('forms/group-edit-form.html',
                                 request,
                                 is_org=is_org,
                                 form=form,
                                 group_base_url=group_base_url,
                                 group=group,
                                 back_url=back_url,
                                 delete_url=delete_url)
示例#25
0
    def compute_attributes(self):
        """Recompute derived attributes (``url_name``, ``words``); return self."""
        url_name = conv.check(conv.input_to_url_name)(self.name)
        if url_name is None:
            # Drop a stale url_name when the name no longer yields one.
            if self.url_name is not None:
                del self.url_name
        else:
            self.url_name = url_name

        # Searchable words: unique sorted slug fragments of the id and name.
        self.words = sorted(
            set(
                strings.slugify(u'-'.join(fragment for fragment in (
                    unicode(self._id),
                    self.name,
                ) if fragment is not None)).split(u'-'))) or None

        return self
示例#26
0
    def save(self, *args, **kwargs):
        """Save the project, filling in upload time and a unique slug if missing.

        The slug is derived from the filename; "-2", "-3", ... suffixes are
        tried until no other stored project uses the candidate.
        NOTE(review): the uniqueness probe is not atomic — two concurrent
        saves could pick the same slug; confirm this is acceptable.
        """
        if self.upload_at is None:
            self.upload_at = datetime.datetime.utcnow()

        # find a unique slug based on filename
        if self.slug is None:
            slug = slugify(self.filename)
            distinguish = 1
            while True:
                proposal = slug if distinguish == 1 else '%s-%d' % (slug, distinguish)
                if not Projects.find_one({'slug': proposal}):
                    slug = proposal
                    break
                distinguish += 1
            self.slug = slug

        return super(Projects, self).save(*args, **kwargs)
示例#27
0
 def convert_element_to_article(self, element, updated):
     """Build an article dict from *element*, using its first heading as title.

     h1..h6 are searched in order; the first heading found supplies the
     title, its wrapping link (if any) supplies ``title_url``, and the
     heading's container is removed from the element.
     NOTE(review): header_element stays None when no ancestor matches the
     header-like tags — presumably iter_element_ancestors yields the
     heading itself (h1..h6 always match); confirm, else
     ``header_element.getparent()`` raises.
     """
     title_url = None
     for xpath in (
             './/h1',
             './/h2',
             './/h3',
             './/h4',
             './/h5',
             './/h6',
     ):
         heading_elements = element.xpath(xpath)
         if len(heading_elements) > 0:
             title = lxml.html.tostring(heading_elements[0],
                                        encoding=unicode,
                                        method='text').strip()
             # Remove header from article element.
             header_element = None
             for ancestor_element in iter_element_ancestors(
                     heading_elements[0]):
                 if ancestor_element.tag in ('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup') \
                         or ancestor_element.tag == 'div' and ancestor_element.get('class') == 'page-header':
                     header_element = ancestor_element
                 if ancestor_element.tag == 'a':
                     # A link wrapping the heading provides the title URL.
                     url, error = conv.pipe(
                         conv.make_input_to_url(),
                         conv.not_none,
                     )(ancestor_element.get('href'), state=self.ctx)
                     if error is None:
                         title_url = url
             header_element.getparent().remove(header_element)
             break
     else:
         title = None
     return dict(
         element=element,
         hash=element.get('id') or strings.slugify(title),
         id=element.get('id'),
         node=self,
         title=title,
         title_url=title_url,
         updated=get_element_time(element, default=updated),
     )
示例#28
0
def edit(req):
    """Update the current test case's title and description from form params."""
    ctx = contexts.Ctx(req)
    user = model.get_user(ctx, check = True)
    params = req.params
    inputs = dict(
        title = params.get('title'),
        description = params.get('description'),
        )
    data, errors = conv.struct({
        'title': conv.cleanup_line,
        'description': conv.cleanup_line,
        })(inputs, state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = errors)
    test_case = ctx.node
    test_case.title = data['title']
    test_case.slug = strings.slugify(data['title'])
    test_case.description = data['description']
    test_case.save(safe = True)
    return wsgihelpers.redirect(ctx, location = user.get_user_url(ctx))
示例#29
0
File: forms.py — Project: etalab/weckan
def handle_upload(request, field, user=None):
    """Store an uploaded file in CKAN storage and return its public URL.

    Returns None when the form field holds no actual upload.  The stored
    key is a timestamped path containing the slugified filename with its
    original extension preserved.
    """
    from ckan.controllers import storage

    if not isinstance(field.data, cgi.FieldStorage):
        return None

    filename, ext = splitext(field.data.filename)
    filename = strings.slugify(filename)
    filename = ''.join([filename, ext])
    filename = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name=filename, ts=datetime.now())
    ofs = storage.get_ofs()
    ofs.put_stream(STORAGE_BUCKET, filename, field.data.file, {
        'filename-original': field.data.filename,
        'uploaded-by': user.name if user else '',
    })
    root = conf['home_url']
    if root.startswith('//'):
        # Protocol-relative home URL: pick http(s) from configuration.
        root = root.replace('//', 'https://' if conf['https'] else 'http://', 1)
    path = urls.get_url(None, 'storage/f', filename)
    return ''.join([root, path])
示例#30
0
 def convert_element_to_article(self, ctx, element, updated):
     """Build an article dict from *element*, using its first heading as title.

     h1..h6 are searched in order; the first heading found supplies the
     title, its wrapping link (if any) supplies ``title_url``, and the
     heading's container is removed from the element.
     NOTE(review): header_element stays None when no ancestor matches the
     header-like tags — presumably iter_element_ancestors yields the
     heading itself (h1..h6 always match); confirm, else
     ``header_element.getparent()`` raises.
     """
     title_url = None
     for xpath in (
             './/h1',
             './/h2',
             './/h3',
             './/h4',
             './/h5',
             './/h6',
             ):
         heading_elements = element.xpath(xpath)
         if len(heading_elements) > 0:
             title = lxml.html.tostring(heading_elements[0], encoding = unicode, method = 'text').strip()
             # Remove header from article element.
             header_element = None
             for ancestor_element in iter_element_ancestors(heading_elements[0]):
                 if ancestor_element.tag in ('a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup') \
                         or ancestor_element.tag == 'div' and ancestor_element.get('class') == 'page-header':
                     header_element = ancestor_element
                 if ancestor_element.tag == 'a':
                     # A link wrapping the heading provides the title URL.
                     url, error = conv.pipe(
                         conv.make_input_to_url(),
                         conv.not_none,
                         )(ancestor_element.get('href'), state = ctx)
                     if error is None:
                         title_url = url
             header_element.getparent().remove(header_element)
             break
     else:
         title = None
     return dict(
         element = element,
         hash = element.get('id') or strings.slugify(title),
         id = element.get('id'),
         node = self,
         title = title,
         title_url = title_url,
         updated = get_element_time(element, default = updated),
         )
示例#31
0
File: conv.py — Project: Gentux/etalage
def csv_infos_to_csv_bytes(csv_infos_by_schema_name, state = None):
    """Convert per-schema CSV infos into UTF-8 encoded CSV byte strings.

    Returns ``(csv_bytes_by_filename, None)`` following the project's
    converter convention; the dict maps "<slugified schema title>.csv" to
    the file content, and is None when there is nothing to export.
    """
    from . import ramdb
    if csv_infos_by_schema_name is None:
        return None, None
    if state is None:
        state = default_state
    csv_bytes_by_name = {}
    for schema_name, csv_infos in csv_infos_by_schema_name.iteritems():
        csv_file = StringIO()
        writer = csv.writer(csv_file, delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
        # Header row: column labels encoded to UTF-8 (empty when missing).
        writer.writerow([
            (label or u'').encode("utf-8")
            for label in csv_infos['columns_label']
            ])
        for row in csv_infos['rows']:
            writer.writerow([
                unicode(cell).encode('utf-8') if cell is not None else None
                for cell in row
                ])
        csv_filename = '{0}.csv'.format(strings.slugify(ramdb.schema_title_by_name.get(schema_name, schema_name)))
        csv_bytes_by_name[csv_filename] = csv_file.getvalue()
    return csv_bytes_by_name or None, None
示例#32
0
File: forms.py — Project: etalab/weckan
def handle_upload(request, field, user=None):
    """Store an uploaded form file in CKAN storage and return its public URL.

    Returns ``None`` when the field does not actually hold an upload.
    """
    from ckan.controllers import storage

    if not isinstance(field.data, cgi.FieldStorage):
        return None

    base_name, extension = splitext(field.data.filename)
    slug = ''.join([strings.slugify(base_name), extension])
    # Prefix the slugified name with an upload timestamp to avoid collisions.
    storage_key = '{ts:%Y-%m-%dT%H-%M-%S}/{name}'.format(name=slug,
                                                         ts=datetime.now())

    ofs = storage.get_ofs()
    ofs.put_stream(
        STORAGE_BUCKET, storage_key, field.data.file, {
            'filename-original': field.data.filename,
            'uploaded-by': user.name if user else '',
        })

    # Build an absolute URL from the configured (possibly protocol-relative) root.
    root = conf['home_url']
    if root.startswith('//'):
        scheme = 'https://' if conf['https'] else 'http://'
        root = root.replace('//', scheme, 1)
    path = urls.get_url(None, 'storage/f', storage_key)
    return ''.join([root, path])
示例#33
0
文件: group.py 项目: etalab/weckan
def create_group_or_org(request, is_org):
    """Display and process the creation form of a CKAN group or organization."""
    context = contexts.Ctx(request)
    lang = request.urlvars.get('lang', templates.DEFAULT_LANG)
    user = auth.get_user_from_request(request)
    if not user:
        return wsgihelpers.unauthorized(context)  # redirect to login/register ?

    form = GroupForm(request.POST, i18n=context.translator)

    if request.method == 'POST' and form.validate():
        # The CKAN name is the slugified title.
        name = strings.slugify(form.title.data)
        action = 'organization_create' if is_org else 'group_create'
        ckan_api(action, user, {
            'name': name,
            'title': form.title.data,
            'description': form.description.data,
            'image_url': form.image_url.data,
        })
        kind = 'organization' if is_org else 'group'
        return wsgihelpers.redirect(context,
            location=urls.get_url(lang, kind, name))

    back_url = urls.get_url(lang, 'organizations' if is_org else 'groups')
    return templates.render_site('forms/group-create-form.html', request,
        is_new=True, is_org=is_org, form=form, back_url=back_url)
示例#34
0
文件: group.py 项目: etalab/weckan
def edit_group_or_org(request, is_org):
    """Display and process the edition form of an existing CKAN group or organization."""
    context = contexts.Ctx(request)
    lang = request.urlvars.get('lang', templates.DEFAULT_LANG)
    user = auth.get_user_from_request(request)
    if not user:
        return wsgihelpers.unauthorized(context)  # redirect to login/register ?

    group = Group.by_name(request.urlvars.get('name'))
    if not group:
        return wsgihelpers.not_found(context)
    form = GroupForm(request.POST, group, i18n=context.translator)

    if request.method == 'POST' and form.validate():
        new_name = strings.slugify(form.title.data)
        # Carry the existing extras and members over, so the update doesn't drop them.
        extras = [{'key': key, 'value': value} for key, value in group.extras.items()]
        ckan_api('organization_update' if is_org else 'group_update', user, {
            'id': group.id,
            'name': new_name,
            'title': form.title.data,
            'description': form.description.data,
            'image_url': form.image_url.data,
            'extras': extras,
            'users': _get_members(group),
        })
        kind = 'organization' if is_org else 'group'
        return wsgihelpers.redirect(context,
            location=urls.get_url(lang, kind, new_name))

    group_type = 'organization' if is_org else 'group'
    return templates.render_site('forms/group-edit-form.html', request,
        is_org=is_org, form=form, group=group,
        group_base_url=urls.get_url(lang, group_type),
        back_url=urls.get_url(lang, group_type, group.name),
        delete_url=urls.get_url(lang, group_type, 'delete', group.name))
示例#35
0
def login(req):
    """Authorization request.

    Validates a Mozilla Persona assertion against the verifier service, then
    opens (or refreshes) a session for the matching account, creating the
    account on first login.
    """
    ctx = contexts.Ctx(req)

    assert req.method == 'POST'
    inputs = dict(assertion=req.POST.get('assertion'))
    data, errors = conv.struct(
        dict(assertion=conv.pipe(
            conv.cleanup_line,
            conv.not_none,
        ), ), )(inputs, state=ctx)
    if errors is not None:
        return wsgihelpers.bad_request(
            ctx, explanation=ctx._(u'Login Error: {0}').format(errors))

    # Ask the Persona verifier to validate the assertion for this audience.
    persona_response = requests.post(
        'https://verifier.login.persona.org/verify',
        data=dict(audience=urls.get_full_url(ctx),
                  assertion=data['assertion']),
        verify=True,
    )
    verification_data = json.loads(persona_response.content) \
        if persona_response.ok else None
    if verification_data is None or verification_data['status'] != 'okay':
        # Either an HTTP-level failure or an explicitly rejected assertion.
        return wsgihelpers.internal_error(
            ctx,
            dump=persona_response.text,
            explanation=ctx._(
                u'Error while verifying authentication assertion'),
        )

    user = model.Account.find_one(
        dict(email=verification_data['email']),
        as_class=collections.OrderedDict,
    )
    if user is None:
        # Unknown email: create a fresh account on the fly.
        user = model.Account()
        user._id = unicode(uuid.uuid4())
        user.api_key = unicode(uuid.uuid4())
        user.email = verification_data['email']
        user.full_name = verification_data['email']
        user.slug = strings.slugify(user.full_name)
        user.compute_attributes()
        user.save(ctx, safe=True)
    ctx.user = user

    session = ctx.session
    if session is None:
        ctx.session = session = model.Session()
        session.synchronizer_token = unicode(uuid.uuid4())
        session.token = unicode(uuid.uuid4())
    # Sliding four-hour expiration, refreshed at every login.
    session.expiration = datetime.datetime.utcnow() + datetime.timedelta(
        hours=4)
    session.user_id = user._id
    session.save(ctx, safe=True)

    if req.cookies.get(conf['cookie']) != session.token:
        req.response.set_cookie(conf['cookie'],
                                session.token,
                                httponly=True,
                                secure=req.scheme == 'https')
    return 'Login succeeded.'
示例#36
0
def main():
    """Synchronize datasets' groups from a per-organization CSV mapping.

    Reads a CSV file mapping each organization to one or two groups, creates
    any missing group in CKAN, then walks every CKAN package: a package owned
    by a mapped organization gains the mapped groups and loses any other group.

    Returns 0 on success (shell exit code).
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('config', help='path of configuration file')
    parser.add_argument(
        'csv_file_path',
        help='path of CSV file containing the groups to use by organization')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='increase output verbosity')

    # NOTE(review): parsed options are exposed as a module-level global —
    # presumably read by other helpers in this module; confirm before changing.
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.WARNING,
        stream=sys.stdout)

    # Load and validate the script's configuration section; unknown keys are
    # dropped (default='drop'), missing/blank required keys abort via conv.check.
    config_parser = ConfigParser.SafeConfigParser(
        dict(here=os.path.dirname(
            os.path.abspath(os.path.normpath(args.config))), ))
    config_parser.read(args.config)
    conf = conv.check(
        conv.pipe(
            conv.test_isinstance(dict),
            conv.struct(
                {
                    'ckan.api_key':
                    conv.pipe(
                        conv.cleanup_line,
                        conv.not_none,
                    ),
                    'ckan.site_url':
                    conv.pipe(
                        conv.make_input_to_url(error_if_fragment=True,
                                               error_if_path=True,
                                               error_if_query=True,
                                               full=True),
                        conv.not_none,
                    ),
                    'user_agent':
                    conv.pipe(
                        conv.cleanup_line,
                        conv.not_none,
                    ),
                },
                default='drop',
            ),
            conv.not_none,
        ))(dict(config_parser.items('Change-datasets-groups-by-organization')),
           conv.default_state)

    ckan_headers = {
        'Authorization': conf['ckan.api_key'],
        'User-Agent': conf['user_agent'],
    }

    # Build the organization -> groups mapping from the CSV file, fetching
    # organizations and groups from CKAN (and creating missing groups).
    group_by_name = {}
    groups_name_by_organization_name = {}
    organization_by_id = {}
    organization_by_name = {}
    with open(args.csv_file_path) as csv_file:
        csv_reader = csv.reader(csv_file)
        # Skip the header row.
        csv_reader.next()
        for row in csv_reader:
            # Expected columns: organization title, first group title,
            # optional second group title (UTF-8 encoded bytes).
            organization_title, group1_title, group2_title = [
                cell.decode('utf-8').strip() or None for cell in row
            ]
            if organization_title is None or group1_title is None:
                continue
            # CKAN names are slugs limited to 100 characters.
            organization_name = strings.slugify(organization_title)[:100]
            if organization_name not in organization_by_name:
                request = urllib2.Request(urlparse.urljoin(
                    conf['ckan.site_url'],
                    '/api/3/action/organization_show?id={}'.format(
                        organization_name)),
                                          headers=ckan_headers)
                try:
                    response = urllib2.urlopen(request)
                # Note: the HTTPError deliberately reuses the `response` name,
                # since HTTPError objects also behave like responses.
                except urllib2.HTTPError as response:
                    if response.code == 404:
                        log.warning(
                            u'Skipping missing organization: {}'.format(
                                organization_name))
                        continue
                    raise
                else:
                    response_dict = json.loads(response.read())
                    organization = response_dict['result']
                    organization_by_id[organization['id']] = organization
                    organization_by_name[organization_name] = organization
            for group_title in (group1_title, group2_title):
                if group_title is None:
                    continue
                group_name = strings.slugify(group_title)[:100]
                if group_name not in group_by_name:
                    request = urllib2.Request(urlparse.urljoin(
                        conf['ckan.site_url'],
                        '/api/3/action/group_show?id={}'.format(group_name)),
                                              headers=ckan_headers)
                    try:
                        response = urllib2.urlopen(request)
                    except urllib2.HTTPError as response:
                        if response.code == 404:
                            # Unknown group: create it in CKAN.
                            log.info(u'Creating group: {}'.format(group_name))
                            request = urllib2.Request(urlparse.urljoin(
                                conf['ckan.site_url'],
                                '/api/3/action/group_create'),
                                                      headers=ckan_headers)
                            # Request body is URL-quoted JSON — apparently what
                            # this CKAN deployment accepts (same pattern below).
                            response = urllib2.urlopen(
                                request,
                                urllib.quote(
                                    json.dumps(
                                        dict(name=group_name,
                                             title=group_title))))
                            response_dict = json.loads(response.read())
                            group_by_name[group_name] = response_dict['result']
                        else:
                            raise
                    else:
                        response_dict = json.loads(response.read())
                        group_by_name[group_name] = response_dict['result']
                groups_name_by_organization_name.setdefault(
                    organization_name, set()).add(group_name)

    # Retrieve names of packages already existing in CKAN.
    request = urllib2.Request(urlparse.urljoin(conf['ckan.site_url'],
                                               '/api/3/action/package_list'),
                              headers=ckan_headers)
    response = urllib2.urlopen(request)
    response_dict = json.loads(response.read())
    packages_name = conv.check(
        conv.pipe(
            conv.ckan_json_to_name_list,
            conv.not_none,
        ))(response_dict['result'], state=conv.default_state)

    # Reconcile each package's groups with its organization's mapped groups.
    for package_name in packages_name:
        request = urllib2.Request(urlparse.urljoin(
            conf['ckan.site_url'],
            '/api/3/action/package_show?id={}'.format(package_name)),
                                  headers=ckan_headers)
        response = urllib2.urlopen(request)
        response_dict = json.loads(response.read())
        package = conv.check(
            conv.pipe(
                conv.make_ckan_json_to_package(drop_none_values=True),
                conv.not_none,
                conv.ckan_input_package_to_output_package,
            ))(response_dict['result'], state=conv.default_state)
        organization_id = package.get('owner_org')
        organization = organization_by_id.get(organization_id)
        # Packages whose organization is not in the CSV mapping are untouched.
        if organization is None:
            continue
        groups_name = set(group['name']
                          for group in (package.get('groups') or []))
        organization_groups_name = groups_name_by_organization_name[
            organization['name']]
        # Add every mapped group the package is missing.
        for group_name in organization_groups_name:
            if group_name not in groups_name:
                log.info(u'Adding group {} to package {}'.format(
                    group_name, package['name']))
                request = urllib2.Request(urlparse.urljoin(
                    conf['ckan.site_url'], '/api/3/action/member_create'),
                                          headers=ckan_headers)
                response = urllib2.urlopen(
                    request,
                    urllib.quote(
                        json.dumps(
                            dict(
                                capacity='public',
                                id=group_name,
                                object=package['name'],
                                object_type='package',
                            ))))
                response_dict = json.loads(response.read())
        # Remove every group that is not in the organization's mapping.
        for group_name in groups_name:
            if group_name not in organization_groups_name:
                log.info(u'Removing group {} from package {}'.format(
                    group_name, package['name']))
                request = urllib2.Request(urlparse.urljoin(
                    conf['ckan.site_url'], '/api/3/action/member_delete'),
                                          headers=ckan_headers)
                response = urllib2.urlopen(
                    request,
                    urllib.quote(
                        json.dumps(
                            dict(
                                id=group_name,
                                object=package['name'],
                                object_type='package',
                            ))))

    return 0
示例#37
0
def login(req):
    """Authorization request.

    Verifies a Mozilla Persona assertion, creating the account and session
    as needed, then sets the session cookie.
    """
    ctx = contexts.Ctx(req)

    assert req.method == 'POST'
    data, errors = conv.struct(
        dict(
            assertion = conv.pipe(
                conv.cleanup_line,
                conv.not_none,
                ),
            ),
        )(dict(assertion = req.POST.get('assertion')), state = ctx)
    if errors is not None:
        return wsgihelpers.bad_request(ctx, explanation = ctx._(u'Login Error: {0}').format(errors))

    # Ask the Persona verifier to validate the assertion for this audience.
    response = requests.post('https://verifier.login.persona.org/verify',
        data = dict(
            audience = urls.get_full_url(ctx),
            assertion = data['assertion'],
            ),
        verify = True,
        )
    if not response.ok:
        return wsgihelpers.internal_error(ctx,
            dump = response.text,
            explanation = ctx._(u'Error while verifying authentication assertion'),
            )
    verification_data = json.loads(response.content)
    if verification_data['status'] != 'okay':
        # The verifier answered, but rejected the assertion.
        return wsgihelpers.internal_error(ctx,
            dump = response.text,
            explanation = ctx._(u'Error while verifying authentication assertion'),
            )

    account = model.Account.find_one(
        dict(
            email = verification_data['email'],
            ),
        as_class = collections.OrderedDict,
        )
    if account is None:
        # First login with this email: create the account on the fly.
        account = model.Account()
        account._id = unicode(uuid.uuid4())
        account.api_key = unicode(uuid.uuid4())
        account.email = verification_data['email']
        account.full_name = verification_data['email']
        account.slug = strings.slugify(account.full_name)
        account.compute_attributes()
        account.save(ctx, safe = True)
    ctx.user = account

    session = ctx.session
    if session is None:
        ctx.session = session = model.Session()
        session.synchronizer_token = unicode(uuid.uuid4())
        session.token = unicode(uuid.uuid4())
    # Sliding four-hour expiration, refreshed at every login.
    session.expiration = datetime.datetime.utcnow() + datetime.timedelta(hours = 4)
    session.user_id = account._id
    session.save(ctx, safe = True)

    if req.cookies.get(conf['cookie']) != session.token:
        req.response.set_cookie(conf['cookie'], session.token, httponly = True, secure = req.scheme == 'https')
    return 'Login succeeded.'
def main():
    """Harvest Data Publica's Etalab export and upsert it into the target CKAN.

    Pages through the source JSON export, validates each publication, upserts
    its groups and organization, and registers the resulting package with the
    harvester. With --dry-run, packages are validated and logged but the
    target CKAN is never modified.

    Returns 0 on success (shell exit code).
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    # NOTE(review): parsed options are exposed as a module-level global —
    # presumably read by other helpers in this module; confirm before changing.
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    # Load and validate the harvester's configuration section; unknown keys
    # are dropped, missing required keys abort via conv.check.
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'dp',
        supplier_title = u'Data Publica',
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_headers = {
        'User-Agent': conf['user_agent'],
        }
    source_site_url = u'http://www.data-publica.com/etalab/export'

    if not args.dry_run:
        harvester.retrieve_target()

    groups_title = set()
    # Retrieve packages from source.
    # Page through the export until a page comes back empty.
    for page_index in itertools.count():
        page_url = urlparse.urljoin(source_site_url, u'?p={}'.format(page_index))
        log.info(u"Harvesting page {}".format(page_url))
        request = urllib2.Request(page_url.encode('utf-8'), headers = source_headers)
        response = urllib2.urlopen(request)
        response_dict = json.loads(response.read())
        publications = response_dict['publications']
        if not publications:
            break
        for publication in publications:
            # Validation aborts the script on invalid publications (conv.check).
            publication = conv.check(conv.pipe(
                validate_publication,
                conv.not_none,
                ))(publication, state = conv.default_state)

            groups_title.update(publication['groups'] or [])
            if not args.dry_run:
                groups = [
                    harvester.upsert_group(dict(
                        title = group_title,
                        ))
                    for group_title in (publication['groups'] or [])
                    ]

            # Publications without an organization fall back to the supplier.
            organization_title = publication['organization']
            if not args.dry_run:
                if organization_title is None:
                    organization = harvester.supplier
                else:
                    organization = harvester.upsert_organization(dict(
                        title = organization_title,
                        ))

            package = dict(
                license_id = license_id_by_name[publication['licenseName']],
                notes = publication['notes'],
                resources = [
                    dict(
                        name = resource['name'],
                        format = format_by_mime_type[resource['mimeType']],
                        url = urlparse.urljoin(source_site_url, resource['url']),
                        )
                    for resource in (publication['resources'] or [])
                    ],
                tags = [
                    dict(name = strings.slugify(tag_name))
                    for tag_name in sorted(set(
                        tag['name']
                        for tag in (publication['tags'] or [])
                        ))
                    ],
#                territorial_coverage = u'Country/FR/FRANCE',
                title = publication['name'],
                )
            source_url = urlparse.urljoin(source_site_url, publication['url'])
            helpers.set_extra(package, u'Source', source_url)

            log.info(u'Harvested package: {}'.format(package['title']))
            if not args.dry_run:
                harvester.add_package(package, organization, publication['name'], source_url, groups = groups)

    if not args.dry_run:
        harvester.update_target()

    log.info(u'Groups: {}'.format(sorted(groups_title)))

    return 0
def main():
    """Harvest the Paris open-data CKAN instance into the target CKAN.

    Lists the source packages through the CKAN API, converts each one to the
    target's package format (fixed territorial coverage, slugified tags,
    renamed resources) and registers it with the harvester. With --dry-run,
    packages are converted and logged but the target CKAN is never modified.
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    # NOTE(review): parsed options are exposed as a module-level global —
    # presumably read by other helpers in this module; confirm before changing.
    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    # Load and validate the harvester's configuration section; unknown keys
    # are dropped, missing required keys abort via conv.check.
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'prs',
        supplier_title = u'Mairie de Paris',
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    source_headers = {
        'User-Agent': conf['user_agent'],
        }
    source_site_url = u'http://opendata.paris.fr/opendata/rest/ckan/'

    if not args.dry_run:
        harvester.retrieve_target()

    # Retrieve names of packages in source.
    request = urllib2.Request(urlparse.urljoin(source_site_url, 'api/3/action/package_list'), headers = source_headers)
    response = urllib2.urlopen(request)
    # Source responses are decoded as cp1252 — TODO confirm this still matches
    # the source site's actual encoding.
    response_dict = json.loads(response.read(), encoding = 'cp1252')
    packages_source_name = conv.check(conv.pipe(
        conv.ckan_json_to_name_list,
        conv.not_none,
        ))(response_dict['result'], state = conv.default_state)

    # Retrieve packages from source.
    for package_source_name in packages_source_name:
        request = urllib2.Request(urlparse.urljoin(source_site_url, u'api/3/action/package_show?id={}'.format(
            package_source_name)).encode('utf-8'), headers = source_headers)
        try:
            response = urllib2.urlopen(request)
        # Python 2 except syntax; HTTPError deliberately reuses the
        # `response` name since HTTPError objects also behave like responses.
        except urllib2.HTTPError, response:
            if response.code == 404:
                log.warning(u'Skipping package {}, because page not found'.format(package_source_name))
                continue
            raise
        response_dict = json.loads(response.read())
        if not response_dict['success']:
            log.warning(u'Skipping package {}, because {}'.format(package_source_name, response_dict))
            continue
        # Convert the source package; the after-converter may return None to
        # signal a package that must be skipped.
        source_package = conv.check(conv.pipe(
            before_ckan_json_to_package,
            conv.make_ckan_json_to_package(drop_none_values = True),
            conv.not_none,
            after_ckan_json_to_package,
            ))(response_dict['result'], state = conv.default_state)
        if source_package is None:
            continue

        package = dict(
#            frequency = source_package.get('frequency'),
            # Only the ODbL license is expected from this source; anything else
            # raises KeyError on purpose.
            license_id = {
                u'ODbL': u'odc-odbl',
                }[source_package.get('license_id')],
            notes = source_package.get('notes'),
            title = source_package['title'],
            resources = [
                dict(
                    created = resource['created'],
                    format = resource.get('format'),
                    last_modified = resource.get('last_modified'),
                    name = resource.get('name') or u'Fichier.{}'.format(resource.get('format')).strip(u'.'),
                    url = resource['url'],
                    )
                for resource in (source_package.get('resources') or [])
                if resource.get('url') is not None
                ],
            # Tags are slugified and deduplicated; very short slugs are dropped.
            tags = [
                dict(name = tag_name)
                for tag_name in sorted(set(
                    strings.slugify(tag['name'])
                    for tag in (source_package.get('tags') or [])
                    ))
                if tag_name and len(tag_name) > 2
                ],
#            temporal_coverage_from = source_package.get('temporal_coverage_from'),
#            temporal_coverage_to = source_package.get('temporal_coverage_to'),
            territorial_coverage = u'CommuneOfFrance/75056/75000 PARIS',
            url = source_package['url'],
            )

        if not args.dry_run:
            # Upsert the source groups (plus a fixed extra group); stays None
            # when the source package declares no groups at all.
            groups = source_package.get('groups')
            if groups is not None:
                groups = [
                    harvester.upsert_group(dict(
                        # Don't reuse image and description of groups, because Etalab has its own.
                        # description = group.get(u'description'),
                        # image_url = group.get(u'image_url'),
                        title = group_title,
                        ))
                    for group_title in sorted(list(set(
                        group['title']
                        for group in (groups or [])
                        )) + [u"Territoires et Transports"])
                    ]

        log.info(u'Harvested package: {}'.format(package['title']))
        if not args.dry_run:
            harvester.add_package(package, harvester.supplier, source_package['name'], package['url'], groups = groups)
def main():
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('download_dir', help = 'directory where are stored downloaded HTML pages')
    parser.add_argument('-d', '--dry-run', action = 'store_true',
        help = "simulate harvesting, don't update CKAN repository")
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-CKAN-Harvesters')), conv.default_state)

    harvester = helpers.Harvester(
        admin_name = u'b-dot-kessler-at-agglo-rennesmetropole-dot-fr',
        supplier_abbreviation = u'rm',
        supplier_title = u'Rennes Métropole en accès libre',
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )

    # Retrieve paths of HTML pages to convert.
    data_dir = os.path.join(args.download_dir, 'data')
    assert os.path.exists(data_dir), "Data directory {0} doesn't exist".format(data_dir)
    data_file_path_by_number = {}
    for (dir, directories_name, filenames) in os.walk(data_dir):
        for directory_name in directories_name[:]:
            if directory_name.startswith('.'):
                directories_name.remove(directory_name)
        for filename in filenames:
            data_file_path = os.path.join(dir, filename)
            match = data_filename_re.match(os.path.basename(data_file_path))
            assert match is not None, data_file_path
            data_number = int(match.group('number'))
            data_file_path_by_number[data_number] = data_file_path

    if not args.dry_run:
        harvester.retrieve_target()

    # Convert source HTML packages to CKAN JSON.
    for data_number, data_file_path in sorted(data_file_path_by_number.iteritems()):
        with open(data_file_path) as data_file:
            try:
                data_str = data_file.read()
                data_html = etree.fromstring(data_str, html_parser)
                html_base_list = data_html.xpath('head/base[@href]')
                base_url = html_base_list[0].get('href')

                dataset_html = data_html.xpath('.//div[@class="tx_icsopendatastore_pi1_single"]')[0]
                assert dataset_html is not None
                title_str = dataset_html.xpath('.//h3')[0].text.strip()
                assert title_str

                publisher_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_publisher separator"]/p[@class="value description"]')
                publisher_str = publisher_html_list[0].text.strip() or None if publisher_html_list else None

                contact_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_contact separator"]/p[@class="value description"]')
                contact_str = contact_html_list[0].text.strip() or None if contact_html_list else None

                creator_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_creator separator"]/p[@class="value description"]')
                creator_str = creator_html_list[0].text.strip() or None if creator_html_list else None

                owner_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_owner separator"]/p[@class="value owner"]')
                owner_str = owner_html_list[0].text.strip() or None if owner_html_list else None
                organization_title, author = conv.check(conv.pipe(
                    conv.test_in(organization_titles_by_owner_str),
                    conv.translate(organization_titles_by_owner_str),
                    conv.default((u"Rennes Métropole", None)),
                    ))(owner_str, state = conv.default_state)
                if not args.dry_run:
                    organization = harvester.upsert_organization(dict(
                        title = organization_title,
                        ))

                categories_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_categories separator"]/p[@class="value description"]')
                categories_str = categories_html_list[0].text.strip() or None if categories_html_list else None
                tags = [
                    dict(name = tag_name)
                    for tag_name in sorted(set(
                        strings.slugify(category_fragment)
                        for category_str in categories_str.split(u',')
                        for category_fragment in category_str.split(u':')
                        ))
                    ]
                if not args.dry_run:
                    groups = [
                        harvester.upsert_group(dict(
                            title = categories_str.split(u',')[0].strip(),
                            )),
                        harvester.upsert_group(dict(
                            title = u'Territoires et Transports',
                            )),
                        ] if categories_str else None

                release_date_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_releasedate separator"]/p[@class="value description"]')
                release_date_str = release_date_html_list[0].text if release_date_html_list else None
                release_date_iso8601_str = conv.check(conv.pipe(
                    french_input_to_date,
                    conv.date_to_iso8601_str,
                    ))(release_date_str, state = conv.default_state)

                update_date_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_updatedate separator"]/p[@class="value description"]')
                update_date_str = update_date_html_list[0].text if update_date_html_list else None
                update_date_iso8601_str = conv.check(conv.pipe(
                    french_input_to_date,
                    conv.date_to_iso8601_str,
                    ))(update_date_str, state = conv.default_state)

                frequency_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_updatefrequency separator"]/p[@class="value description"]')
                frequency_str = frequency_html_list[0].text if frequency_html_list else None
                frequency = conv.check(conv.pipe(
                    conv.cleanup_line,
                    conv.test_in(frequency_translations),
                    conv.translate(frequency_translations),
                    ))(frequency_str, state = conv.default_state)

                description_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_description separator"]/p[@class="value description"]')
                description_str = description_html_list[0].text.strip() or None if description_html_list else None

                technical_data_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_technical_data separator"]'
                    '/p[@class="value technical_data"]')
                technical_data_str = technical_data_html_list[0].text.strip() or None if technical_data_html_list \
                    else None

                license_html_list = dataset_html.xpath(
                    './/div[@class="tx_icsopendatastore_pi1_licence separator"]/p[@class="value owner"]/a')
                license_str = license_html_list[0].text if license_html_list else None
                license_id = conv.check(conv.pipe(
                    conv.cleanup_line,
                    conv.test_in(license_id_by_str),
                    conv.translate(license_id_by_str),
                    ))(license_str, state = conv.default_state)

                resources = []
                for resource_html in dataset_html.xpath('.//div[@class="tx_icsopendatastore_pi1_file"]'):
                    resource_url = urlparse.urljoin(base_url, resource_html.xpath('.//a[@href]')[0].get('href'))
                    resource_path = urlparse.urlsplit(resource_url)
                    filename = resource_url.rstrip('/').rsplit(u'/', 1)[-1] or u'Fichier'
                    if not filename or fi
                    resources.append(dict(
                        created = release_date_iso8601_str,
                        format = resource_html.xpath('.//span[@class="coin"]')[0].text.strip() or None,
                        last_modified = update_date_iso8601_str,
                        name = filename,
                        url = resource_url,
                        ))
            except:
                print 'An exception occured in file {0}'.format(data_number)
                raise

        package = dict(
            author = author,
            frequency = frequency,
            license_id = license_id,
            maintainer = contact_str,
            notes = description_str,
            resources = resources,
            tags = tags,
            territorial_coverage = u'IntercommunalityOfFrance/243500139/CA RENNES METROPOLE',
            title = title_str,
            url = u'http://www.data.rennes-metropole.fr/les-donnees/catalogue/?tx_icsopendatastore_pi1[uid]={}'
                .format(data_number),
            )
        helpers.set_extra(package, u'Données techniques', technical_data_str)
        helpers.set_extra(package, u'Éditeur', publisher_str)
        helpers.set_extra(package, u'Auteur', creator_str)

        if not args.dry_run:
            harvester.add_package(package, organization, package['title'], package['url'], groups = groups)

    if not args.dry_run:
        harvester.update_target()

    return 0
def main():
    """Harvest packages from an OpenDataSoft CKAN instance into the Etalab CKAN.

    Fetches the package list from the source CKAN API, converts each package,
    maps its publisher to a target organization and republishes it through the
    project's ``helpers.Harvester``. Returns 0 (used as process exit code).
    """
    parser = argparse.ArgumentParser(description = __doc__)
    parser.add_argument('config', help = 'path of configuration file')
    parser.add_argument('-v', '--verbose', action = 'store_true', help = 'increase output verbosity')

    global args
    args = parser.parse_args()
    logging.basicConfig(level = logging.DEBUG if args.verbose else logging.WARNING, stream = sys.stdout)

    # Load and validate the configuration file section used by this harvester.
    config_parser = ConfigParser.SafeConfigParser(dict(
        here = os.path.dirname(os.path.abspath(os.path.normpath(args.config))),
        ))
    config_parser.read(args.config)
    conf = conv.check(conv.pipe(
        conv.test_isinstance(dict),
        conv.struct(
            {
                'ckan.api_key': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'opendatasoft.ckan.password': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'opendatasoft.ckan.site_url': conv.pipe(
                    conv.make_input_to_url(error_if_fragment = True, error_if_path = True, error_if_query = True,
                        full = True),
                    conv.not_none,
                    ),
                'opendatasoft.ckan.username': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                'user_agent': conv.pipe(
                    conv.cleanup_line,
                    conv.not_none,
                    ),
                },
            default = 'drop',
            ),
        conv.not_none,
        ))(dict(config_parser.items('Etalab-OpenDataSoft-Harvester')), conv.default_state)

    harvester = helpers.Harvester(
        supplier_abbreviation = u'ods',
        supplier_title = u"OpenDataSoft",
        target_headers = {
            'Authorization': conf['ckan.api_key'],
            'User-Agent': conf['user_agent'],
            },
        target_site_url = conf['ckan.site_url'],
        )
    # The source CKAN uses HTTP basic authentication.
    source_headers = {
        'Authorization': 'Basic {}'.format(base64.encodestring('{}:{}'.format(conf['opendatasoft.ckan.username'],
            conf['opendatasoft.ckan.password'])).replace('\n', '')),
        'User-Agent': conf['user_agent'],
        }
    source_site_url = conf['opendatasoft.ckan.site_url']

    harvester.retrieve_target()

    # Map source publisher slugs to (organization title, author) couples.
    # False means the publisher's packages must be skipped; a None title means
    # "reuse the publisher name as organization title". Built once, outside the loop.
    organization_infos_by_publisher_slug = {
        None: (u"OpenDataSoft", None),
        u"adt-et-ots-des-alpes-de-haute-provence": False,  # Datasets must be merged.
        u"agence-bio": False,  # Direct member of data.gouv.fr
        u"agence-des-espaces-verts-idf": (None, None),
        u"autolib": (None, None),
        u"comite-departemental-de-tourisme-du-pas-de-calais": False,
        u"conseil-general-des-hauts-de-seine": (None, None),
        u"ctc-corse": False,  # Bad titles and descriptions
        u"direction-regionale-du-travail-de-l-emploi-et-de-la-formation-professionnelle": False,  # Direct member of data.gouv.fr?
        u"driea-sit-del-2": (None, None),
        u"federation-nationale-des-bistrots-de-pays": (None, None),
        u"gip-corse-competences": (u"GIP Corse Compétences", None),
        u"iau-idf": (None, None),
        u"ign": False,  # Direct member of data.gouv.fr
        u"insee": False,  # Direct member of data.gouv.fr
        u"jcdecaux-developer": (None, None),
        u"la-poste": False,  # Direct member of data.gouv.fr
        u"le-rif": (None, None),
        u"ministere-de-l-education-nationale": False,  # Direct member of data.gouv.fr
        u"ministere-de-l-interieur": False,  # Direct member of data.gouv.fr
        u"ministere-de-la-culture-et-de-la-communication": False,  # Direct member of data.gouv.fr
        u"ministere-de-la-justice": False,  # Direct member of data.gouv.fr
        u"ministere-des-sports": False,  # Direct member of data.gouv.fr
        u"oehc": False,  # Bad titles and descriptions
        u"premier-ministre-direction-de-l-information-legale-et-administrative": False,  # Direct member of data.gouv.fr
        u"ratp": False,  # Direct member of data.gouv.fr
        u"reseau-ferre-de-france": False,  # Direct member of data.gouv.fr
        u"region-ile-de-france": False,  # Datasets must be merged.
        u"sncf": (u"Société nationale des chemins de fer français", None),  # Direct member of data.gouv.fr, but other datasets
        u"societe-nationale-des-chemins-de-fer-francais": False,  # Direct member of data.gouv.fr
        u"ville-de-paris": (u"Mairie de Paris", None),
        u"ville-de-paris-direction-de-la-proprete-et-de-l-eau": (u"Mairie de Paris",
            u"Direction de la propreté et de l'eau"),
        }

    # Retrieve names of packages in source.
    request = urllib2.Request(urlparse.urljoin(source_site_url, 'api/3/action/package_list'),
        headers = source_headers)
    response = urllib2.urlopen(request)
    response_dict = json.loads(response.read())
    packages_source_name = conv.check(conv.pipe(
        conv.ckan_json_to_name_list,
        conv.not_none,
        ))(response_dict['result'], state = conv.default_state)

    # Retrieve packages from source.
    for package_source_name in packages_source_name:
        request = urllib2.Request(urlparse.urljoin(source_site_url, 'api/3/action/package_show'),
            headers = source_headers)
        response = urllib2.urlopen(request, urllib.quote(json.dumps(dict(
                id = package_source_name,
                ))))  # CKAN 1.7 requires a POST.
        response_dict = json.loads(response.read())
        package = conv.check(conv.pipe(
            before_ckan_json_to_package,
            conv.make_ckan_json_to_package(drop_none_values = True),
            after_ckan_json_to_package,
            ))(response_dict['result'], state = conv.default_state)
        if package is None:
            continue

        publisher = helpers.get_extra(package, 'publisher')
        organization_infos = organization_infos_by_publisher_slug.get(strings.slugify(publisher))
        if organization_infos is None:
            log.warning(u'Ignoring package "{}" from unknown publisher "{}"'.format(package['title'], publisher))
            continue
        if organization_infos is False:
            continue
        organization_title, author = organization_infos
        if organization_title is None:
            organization_title = publisher
        organization = harvester.upsert_organization(dict(
            title = organization_title,
            ))

        package['author'] = author
        package.pop('groups', None)
        source_name = package.pop('name')
        package.pop('users', None)

        # Use the first HTML resource as the package's source URL.
        for resource in package['resources']:
            if resource['format'] == 'HTML':
                source_url = resource['url']
                break
        else:
            # TODO: Find a proper source URL when the package has no HTML resource.
            # Was a bare `TODO` name, which raised NameError and aborted the whole
            # harvest; log and fall back to the placeholder instead.
            log.warning(u'Ignoring missing source URL for package "{}"'.format(package['title']))
            source_url = u'TODO URL'
        helpers.set_extra(package, u'Source', source_url)
        helpers.pop_extra(package, 'publisher', None)

        package = conv.check(conv.ckan_input_package_to_output_package)(package, state = conv.default_state)
        log.info(u'Harvested package: {}'.format(package['title']))
        harvester.add_package(package, organization, source_name, source_url)

    harvester.update_target()

    return 0