Пример #1
0
 class Tester(db.Document):
     extras = db.ExtrasField()
Пример #2
0
class Dataset(WithMetrics, BadgeMixin, db.Owned, db.Document):
    created_at = DateTimeField(verbose_name=_('Creation date'),
                               default=datetime.now,
                               required=True)
    last_modified = DateTimeField(verbose_name=_('Last modification date'),
                                  default=datetime.now,
                                  required=True)
    title = db.StringField(required=True)
    slug = db.SlugField(max_length=255,
                        required=True,
                        populate_from='title',
                        update=True)
    description = db.StringField(required=True, default='')
    license = db.ReferenceField('License')

    tags = db.TagListField()
    resources = db.ListField(db.EmbeddedDocumentField(Resource))

    private = db.BooleanField()
    frequency = db.StringField(choices=UPDATE_FREQUENCIES.keys())
    frequency_date = db.DateTimeField(verbose_name=_('Future date of update'))
    temporal_coverage = db.EmbeddedDocumentField(db.DateRange)
    spatial = db.EmbeddedDocumentField(SpatialCoverage)

    ext = db.MapField(db.GenericEmbeddedDocumentField())
    extras = db.ExtrasField()

    featured = db.BooleanField(required=True, default=False)

    deleted = db.DateTimeField()

    def __str__(self):
        return self.title or ''

    __unicode__ = __str__

    __badges__ = {
        PIVOTAL_DATA: _('Pivotal data'),
    }

    meta = {
        'indexes': [
            '-created_at',
            'slug',
            'resources.id',
            'resources.urlhash',
        ] + db.Owned.meta['indexes'],
        'ordering': ['-created_at'],
        'queryset_class':
        DatasetQuerySet,
    }

    before_save = signal('Dataset.before_save')
    after_save = signal('Dataset.after_save')
    on_create = signal('Dataset.on_create')
    on_update = signal('Dataset.on_update')
    before_delete = signal('Dataset.before_delete')
    after_delete = signal('Dataset.after_delete')
    on_delete = signal('Dataset.on_delete')

    verbose_name = _('dataset')

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        cls.before_save.send(document)

    @classmethod
    def post_save(cls, sender, document, **kwargs):
        cls.after_save.send(document)
        if kwargs.get('created'):
            cls.on_create.send(document)
        else:
            cls.on_update.send(document)
        if document.deleted:
            cls.on_delete.send(document)

    def clean(self):
        super(Dataset, self).clean()
        if self.frequency in LEGACY_FREQUENCIES:
            self.frequency = LEGACY_FREQUENCIES[self.frequency]

    def url_for(self, *args, **kwargs):
        return url_for('datasets.show', dataset=self, *args, **kwargs)

    display_url = property(url_for)

    @property
    def external_url(self):
        return self.url_for(_external=True)

    @property
    def image_url(self):
        if self.organization:
            return self.organization.logo.url
        elif self.owner:
            return self.owner.avatar.url

    @property
    def frequency_label(self):
        return UPDATE_FREQUENCIES.get(self.frequency or 'unknown',
                                      UPDATE_FREQUENCIES['unknown'])

    def check_availability(self):
        """Check if resources from that dataset are available.

        Return a list of booleans.
        """
        # Only check remote resources.
        remote_resources = [
            resource for resource in self.resources
            if resource.filetype == 'remote'
        ]
        if not remote_resources:
            return []
        # First, we try to retrieve all data from the group (slug).
        error, response = check_url_from_group(self.slug)
        if error:
            # The group is unknown, the check will be performed by resource.
            return [
                resource.check_availability(self.slug)
                for resource in remote_resources
            ]
        else:
            return [(int(url_infos.get(
                'status', httplib.UNPROCESSABLE_ENTITY)) == httplib.OK)
                    for url_infos in response['urls']]

    @property
    def last_update(self):
        if self.resources:
            return max(resource.published for resource in self.resources)
        else:
            return self.last_modified

    @property
    def next_update(self):
        """Compute the next expected update date,

        given the frequency and last_update.
        Return None if the frequency is not handled.
        """
        delta = None
        if self.frequency == 'daily':
            delta = timedelta(days=1)
        elif self.frequency == 'weekly':
            delta = timedelta(weeks=1)
        elif self.frequency == 'fortnighly':
            delta = timedelta(weeks=2)
        elif self.frequency == 'monthly':
            delta = timedelta(weeks=4)
        elif self.frequency == 'bimonthly':
            delta = timedelta(weeks=4 * 2)
        elif self.frequency == 'quarterly':
            delta = timedelta(weeks=52 / 4)
        elif self.frequency == 'biannual':
            delta = timedelta(weeks=52 / 2)
        elif self.frequency == 'annual':
            delta = timedelta(weeks=52)
        elif self.frequency == 'biennial':
            delta = timedelta(weeks=52 * 2)
        elif self.frequency == 'triennial':
            delta = timedelta(weeks=52 * 3)
        elif self.frequency == 'quinquennial':
            delta = timedelta(weeks=52 * 5)
        if delta is None:
            return
        else:
            return self.last_update + delta

    @cached_property
    def quality(self):
        """Return a dict filled with metrics related to the inner

        quality of the dataset:

            * number of tags
            * description length
            * and so on
        """
        from udata.models import Discussion  # noqa: Prevent circular imports
        result = {}
        if not self.id:
            # Quality is only relevant on saved Datasets
            return result
        if self.next_update:
            result['frequency'] = self.frequency
            result['update_in'] = -(self.next_update - datetime.now()).days
        if self.tags:
            result['tags_count'] = len(self.tags)
        if self.description:
            result['description_length'] = len(self.description)
        if self.resources:
            result['has_resources'] = True
            result['has_only_closed_formats'] = all(
                resource.closed_format for resource in self.resources)
            result['has_unavailable_resources'] = not all(
                self.check_availability())
        discussions = Discussion.objects(subject=self)
        if discussions:
            result['discussions'] = len(discussions)
            result['has_untreated_discussions'] = not all(
                discussion.person_involved(self.owner)
                for discussion in discussions)
        result['score'] = self.compute_quality_score(result)
        return result

    def compute_quality_score(self, quality):
        """Compute the score related to the quality of that dataset."""
        score = 0
        UNIT = 2
        if 'frequency' in quality:
            # TODO: should be related to frequency.
            if quality['update_in'] < 0:
                score += UNIT
            else:
                score -= UNIT
        if 'tags_count' in quality:
            if quality['tags_count'] > 3:
                score += UNIT
        if 'description_length' in quality:
            if quality['description_length'] > 100:
                score += UNIT
        if 'has_resources' in quality:
            if quality['has_only_closed_formats']:
                score -= UNIT
            else:
                score += UNIT
            if quality['has_unavailable_resources']:
                score -= UNIT
            else:
                score += UNIT
        if 'discussions' in quality:
            if quality['has_untreated_discussions']:
                score -= UNIT
            else:
                score += UNIT
        if score < 0:
            return 0
        return score

    @classmethod
    def get(cls, id_or_slug):
        obj = cls.objects(slug=id_or_slug).first()
        return obj or cls.objects.get_or_404(id=id_or_slug)

    def add_resource(self, resource):
        '''Perform an atomic prepend for a new resource'''
        resource.validate()
        self.update(
            __raw__={
                '$push': {
                    'resources': {
                        '$each': [resource.to_mongo()],
                        '$position': 0
                    }
                }
            })
        self.reload()
        post_save.send(self.__class__, document=self)

    def update_resource(self, resource):
        '''Perform an atomic update for an existing resource'''
        index = self.resources.index(resource)
        data = {'resources__{index}'.format(index=index): resource}
        self.update(**data)
        self.reload()
        post_save.send(self.__class__, document=self)

    @property
    def community_resources(self):
        return self.id and CommunityResource.objects.filter(dataset=self) or []

    @cached_property
    def json_ld(self):
        result = {
            '@context':
            'http://schema.org',
            '@type':
            'Dataset',
            '@id':
            str(self.id),
            'alternateName':
            self.slug,
            'dateCreated':
            self.created_at.isoformat(),
            'dateModified':
            self.last_modified.isoformat(),
            'url':
            url_for('datasets.show', dataset=self, _external=True),
            'name':
            self.title,
            'keywords':
            ','.join(self.tags),
            'distribution': [resource.json_ld for resource in self.resources],
            # Theses values are not standard
            'contributedDistribution':
            [resource.json_ld for resource in self.community_resources],
            'extras':
            [self.get_json_ld_extra(*item) for item in self.extras.items()],
        }

        if self.description:
            result['description'] = mdstrip(self.description)

        if self.license and self.license.url:
            result['license'] = self.license.url

        if self.organization:
            author = self.organization.json_ld
        elif self.owner:
            author = self.owner.json_ld
        else:
            author = None

        if author:
            result['author'] = author

        return result

    @staticmethod
    def get_json_ld_extra(key, value):

        value = value.serialize() if hasattr(value, 'serialize') else value
        return {
            '@type': 'http://schema.org/PropertyValue',
            'name': key,
            'value': value,
        }
Пример #3
0
class Reuse(db.Datetimed, WithMetrics, BadgeMixin, db.Owned, db.Document):
    title = db.StringField(required=True)
    slug = db.SlugField(max_length=255,
                        required=True,
                        populate_from='title',
                        update=True,
                        follow=True)
    description = db.StringField(required=True)
    type = db.StringField(required=True, choices=list(REUSE_TYPES))
    url = db.StringField(required=True)
    urlhash = db.StringField(required=True, unique=True)
    image_url = db.StringField()
    image = db.ImageField(fs=images,
                          basename=default_image_basename,
                          max_size=IMAGE_MAX_SIZE,
                          thumbnails=IMAGE_SIZES)
    datasets = db.ListField(
        db.ReferenceField('Dataset', reverse_delete_rule=db.PULL))
    tags = db.TagListField()
    # badges = db.ListField(db.EmbeddedDocumentField(ReuseBadge))

    private = db.BooleanField()

    ext = db.MapField(db.GenericEmbeddedDocumentField())
    extras = db.ExtrasField()

    featured = db.BooleanField()
    deleted = db.DateTimeField()

    def __str__(self):
        return self.title or ''

    __badges__ = {}

    __search_metrics__ = Object(properties={
        'datasets': Integer(),
        'followers': Integer(),
        'views': Integer(),
    })

    __metrics_keys__ = [
        'discussions',
        'issues',
        'datasets',
        'followers',
        'views',
    ]

    meta = {
        'indexes': ['-created_at', 'urlhash'] + db.Owned.meta['indexes'],
        'ordering': ['-created_at'],
        'queryset_class': ReuseQuerySet,
    }

    before_save = Signal()
    after_save = Signal()
    on_create = Signal()
    on_update = Signal()
    before_delete = Signal()
    after_delete = Signal()
    on_delete = Signal()

    verbose_name = _('reuse')

    @classmethod
    def pre_save(cls, sender, document, **kwargs):
        # Emit before_save
        cls.before_save.send(document)

    @classmethod
    def post_save(cls, sender, document, **kwargs):
        if 'post_save' in kwargs.get('ignores', []):
            return
        cls.after_save.send(document)
        if kwargs.get('created'):
            cls.on_create.send(document)
        else:
            cls.on_update.send(document)
        if document.deleted:
            cls.on_delete.send(document)

    def url_for(self, *args, **kwargs):
        return url_for('reuses.show', reuse=self, *args, **kwargs)

    display_url = property(url_for)

    @property
    def is_visible(self):
        return not self.is_hidden

    @property
    def is_hidden(self):
        return len(self.datasets) == 0 or self.private or self.deleted

    @property
    def external_url(self):
        return self.url_for(_external=True)

    @property
    def type_label(self):
        return REUSE_TYPES[self.type]

    def clean(self):
        '''Auto populate urlhash from url'''
        if not self.urlhash or 'url' in self._get_changed_fields():
            self.urlhash = hash_url(self.url)

    @classmethod
    def get(cls, id_or_slug):
        obj = cls.objects(slug=id_or_slug).first()
        return obj or cls.objects.get_or_404(id=id_or_slug)

    @classmethod
    def url_exists(cls, url):
        urlhash = hash_url(url)
        return cls.objects(urlhash=urlhash).count() > 0

    @cached_property
    def json_ld(self):
        result = {
            '@context': 'http://schema.org',
            '@type': 'CreativeWork',
            'alternateName': self.slug,
            'dateCreated': self.created_at.isoformat(),
            'dateModified': self.last_modified.isoformat(),
            'url': url_for('reuses.show', reuse=self, _external=True),
            'name': self.title,
            'isBasedOnUrl': self.url,
        }

        if self.description:
            result['description'] = mdstrip(self.description)

        if self.organization:
            author = self.organization.json_ld
        elif self.owner:
            author = self.owner.json_ld
        else:
            author = None

        if author:
            result['author'] = author

        return result

    @property
    def views_count(self):
        return self.metrics.get('views', 0)

    def count_datasets(self):
        self.metrics['datasets'] = len(self.datasets)
        self.save(signal_kwargs={'ignores': ['post_save']})

    def count_discussions(self):
        from udata.models import Discussion
        self.metrics['discussions'] = Discussion.objects(subject=self,
                                                         closed=None).count()
        self.save()

    def count_issues(self):
        from udata.models import Issue
        self.metrics['issues'] = Issue.objects(subject=self,
                                               closed=None).count()
        self.save()

    def count_followers(self):
        from udata.models import Follow
        self.metrics['followers'] = Follow.objects(
            until=None).followers(self).count()
        self.save()
Пример #4
0
class ResourceMixin(object):
    id = db.AutoUUIDField(primary_key=True)
    title = db.StringField(verbose_name="Title", required=True)
    description = db.StringField()
    filetype = db.StringField(choices=RESOURCE_TYPES.keys(),
                              default='file',
                              required=True)
    url = db.URLField(required=True)
    urlhash = db.StringField()
    checksum = db.EmbeddedDocumentField(Checksum)
    format = db.StringField()
    mime = db.StringField()
    filesize = db.IntField()  # `size` is a reserved keyword for mongoengine.
    extras = db.ExtrasField()

    created_at = db.DateTimeField(default=datetime.now, required=True)
    modified = db.DateTimeField(default=datetime.now, required=True)
    published = db.DateTimeField(default=datetime.now, required=True)
    deleted = db.DateTimeField()

    def clean(self):
        super(ResourceMixin, self).clean()
        if not self.urlhash or 'url' in self._get_changed_fields():
            self.urlhash = hash_url(self.url)

    @property
    def closed_format(self):
        """Return True if the specified format is in CLOSED_FORMATS."""
        return self.format.lower() in CLOSED_FORMATS

    def check_availability(self, group):
        """Check if a resource is reachable against a Croquemort server.

        Return a boolean.
        """
        if self.filetype == 'remote':
            # We perform a quick check for performances matters.
            error, response = check_url_from_cache(self.url, group)
            if error or 'status' not in response:
                return False
            elif int(response['status']) >= httplib.INTERNAL_SERVER_ERROR:
                return False
            else:
                return True
        else:
            return True  # We consider that API cases (types) are OK.

    @property
    def is_available(self):
        return self.check_availability(group=None)

    @property
    def latest(self):
        '''
        Permanent link to the latest version of this resource.

        If this resource is updated and `url` changes, this property won't.
        '''
        return url_for('datasets.resource', id=self.id, _external=True)

    @cached_property
    def json_ld(self):

        result = {
            '@type': 'DataDownload',
            '@id': str(self.id),
            'url': self.latest,
            'name': self.title or _('Nameless resource'),
            'contentUrl': self.url,
            'dateCreated': self.created_at.isoformat(),
            'dateModified': self.modified.isoformat(),
            'datePublished': self.published.isoformat(),
        }

        if 'views' in self.metrics:
            result['interactionStatistic'] = {
                '@type': 'InteractionCounter',
                'interactionType': {
                    '@type': 'DownloadAction',
                },
                'userInteractionCount': self.metrics['views']
            }

        if self.format:
            result['encodingFormat'] = self.format

        if self.filesize:
            result['contentSize'] = self.filesize

        if self.mime:
            result['fileFormat'] = self.mime

        if self.description:
            result['description'] = mdstrip(self.description)

        # These 2 values are not standard
        if self.checksum:
            result['checksum'] = self.checksum.value,
            result['checksumType'] = self.checksum.type or 'sha1'

        return result