示例#1
0
文件: cluster.py 项目: keho98/argos
    def members(cls):
        """
        Create the `members` relationship described by the
        subclass's `__members__` class attribute.

        `__members__` is a dict supplying the related class name
        (`class_name`), the value passed as `secondary`, and the
        name of the backref (`backref_name`).

        Example::

            __members__ = {'class_name': 'Article', 'secondary': events_articles, 'backref_name': 'events'}
        """
        spec = cls.__members__
        target = spec['class_name']
        link_table = spec['secondary']
        reverse = db.backref(spec['backref_name'])

        return db.relationship(target, secondary=link_table, backref=reverse)
示例#2
0
class ConceptConceptAssociation(BaseConceptAssociation):
    """
    Association object linking one concept to another concept.

    Both slug columns reference `concept.slug` (with cascading
    deletes and updates) and together form the composite primary key.
    Anything else the association carries (e.g. a score) is presumably
    provided by BaseConceptAssociation -- confirm against that class.
    """
    # Slug of the concept on the "from" side of the association.
    from_concept_slug = db.Column(db.String,
                                  db.ForeignKey('concept.slug',
                                                ondelete='CASCADE',
                                                onupdate='CASCADE'),
                                  primary_key=True)
    # Slug of the concept reached through the `concept` relationship below.
    concept_slug = db.Column(db.String,
                             db.ForeignKey('concept.slug',
                                           ondelete='CASCADE',
                                           onupdate='CASCADE'),
                             primary_key=True)
    # Both columns above point at `concept.slug`, so the column to join on
    # is named explicitly. The backref exposes these associations on the
    # Concept side as `from_concept_associations`.
    concept = db.relationship('Concept',
                              backref=db.backref('from_concept_associations'),
                              foreign_keys=[concept_slug])
示例#3
0
    def mentions(cls):
        """
        Create the `mentions` relationship (to 'Alias') described
        by the subclass's `__mentions__` class attribute.

        `__mentions__` is a dict supplying the value passed as
        `secondary` and the name of the backref (`backref_name`).

        Example::

            __mentions__ = {'secondary': articles_mentions, 'backref_name': 'articles'}
        """
        spec = cls.__mentions__
        link_table = spec['secondary']
        reverse = db.backref(spec['backref_name'])

        return db.relationship('Alias', secondary=link_table, backref=reverse)
示例#4
0
文件: cluster.py 项目: keho98/argos
    def entities(cls):
        """
        Create the `entities` relationship (to 'Entity') described
        by the subclass's `__entities__` class attribute.

        `__entities__` is a dict supplying the value passed as
        `secondary` and the name of the backref (`backref_name`).

        Example::

            __entities__ = {'secondary': articles_entities, 'backref_name': 'articles'}
        """
        spec = cls.__entities__
        link_table = spec['secondary']
        reverse = db.backref(spec['backref_name'])

        return db.relationship('Entity', secondary=link_table, backref=reverse)
示例#5
0
文件: cluster.py 项目: keho98/argos
    def members(cls):
        """
        Assemble the `members` relationship using the settings in
        the subclass's `__members__` class attribute.

        Keys read from `__members__`: `class_name` (relationship
        target), `secondary` and `backref_name`.

        Example::

            __members__ = {'class_name': 'Article', 'secondary': events_articles, 'backref_name': 'events'}
        """
        settings = cls.__members__
        related = settings['class_name']
        through = settings['secondary']
        back = db.backref(settings['backref_name'])

        return db.relationship(related, secondary=through, backref=back)
示例#6
0
    def concept_associations(cls):
        """
        Create the concept-association relationship described by
        the subclass's `__concepts__` class attribute.

        An Associated Object is used (rather than a plain link) so
        that an extra property can be stored per link: the importance
        score of a particular concept to a given clusterable.
        The clusterable's concepts themselves are accessed directly
        through the `concepts` property.

        The association model should inherit from BaseConceptAssociation.
        Associations are ordered by descending `score`.

        Example::

            __concepts__ = {'association_model': ArticleConceptAssociation,
                            'backref_name': 'article'}
        """
        spec = cls.__concepts__
        model = spec['association_model']
        reverse = db.backref(spec['backref_name'])
        ordering = model.score.desc()

        return db.relationship(model,
                               backref=reverse,
                               cascade='all, delete, delete-orphan',
                               order_by=ordering)
示例#7
0
    def concept_associations(cls):
        """
        Assemble the concept-association relationship using the
        settings in the subclass's `__concepts__` class attribute.

        The relationship targets an Associated Object so each link
        can carry an additional property: the importance score of a
        particular concept to a given clusterable. The clusterable's
        concepts are reached directly through the `concepts` property.

        The association model should inherit from BaseConceptAssociation.
        Results are ordered with the highest `score` first.

        Example::

            __concepts__ = {'association_model': ArticleConceptAssociation,
                            'backref_name': 'article'}
        """
        settings = cls.__concepts__
        association = settings['association_model']
        back = db.backref(settings['backref_name'])
        by_score = association.score.desc()

        return db.relationship(association,
                               backref=back,
                               cascade='all, delete, delete-orphan',
                               order_by=by_score)
示例#8
0
class User(Model, UserMixin):
    """
    A user account.

    Attributes:

        * id -> Integer (Primary Key)
        * email -> String (Unique)
        * image -> String (Unique)
        * name -> String (Unique)
        * password -> String
        * active -> Bool
        * confirmed_at -> DateTime
        * auths -> [Auth]
        * roles -> [Role]
        * watching -> [Story]
        * bookmarked -> [Event]
        * created_at -> DateTime
        * updated_at -> DateTime
    """
    id = db.Column(db.Integer(), primary_key=True)
    email = db.Column(db.String(255), unique=True)
    image = db.Column(db.String(255), unique=True)
    name = db.Column(db.String(255), unique=True)
    password = db.Column(db.String(255))
    active = db.Column(db.Boolean())
    confirmed_at = db.Column(db.DateTime())
    auths = db.relationship('Auth', backref='user', lazy='dynamic')
    roles = db.relationship(
        'Role',
        secondary=roles_users,
        backref=db.backref('users', lazy='dynamic'))
    watching = db.relationship(
        'Story',
        secondary=users_stories,
        backref=db.backref('watchers', lazy='joined'))
    bookmarked = db.relationship(
        'Event',
        secondary=users_events,
        backref=db.backref('bookmarkers', lazy='joined'))
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)

    def __init__(self, auth=None, **kwargs):
        """
        Set every keyword argument as an attribute on the new user.

        The `auth` parameter is accepted but not used here.
        """
        for attr, value in kwargs.items():
            setattr(self, attr, value)

    def add_provider(self, provider, provider_id, access_token, access_token_secret=None, update=True):
        """
        Attach a provider authentication to this user and commit.

        If no authentication exists for the provider/provider id pair,
        a new one is created for this user. If one exists and already
        belongs to this user, its tokens are refreshed when `update`
        is true.

        Raises an AuthExistsForUserException if the authentication
        already exists and is associated with another user.

        Args:
            | provider (str)            -- the provider name, e.g. 'twitter'
            | provider_id (str)         -- the id assigned by the provider
            | access_token (str)        -- the access token
            | access_token_secret (str) -- the access token secret
            | update (bool)             -- whether or not to update the existing
                                        provider authentication, if found (default: True)

        Returns the new or existing authentication.
        """
        existing = Auth.for_provider(provider, provider_id)
        if not existing:
            # Nothing on record yet: create the authentication for this user.
            existing = Auth(provider, provider_id, access_token, access_token_secret)
            existing.user = self
            db.session.add(existing)
        elif existing.user is not self:
            raise AuthExistsForUserException('Found an existing authorization for {0} associated with another user.'.format(provider))
        elif update:
            existing.update_token(access_token, access_token_secret)

        db.session.commit()
        return existing

    def merge(self, user):
        """
        Fold another user into this one, then delete the other user.

        *This* user is the canonical one (i.e. its attributes are
        preferred over the other user's).

        UI tip: prompt the user to pick which account is their primary one!
        """
        own_providers = [own_auth.provider for own_auth in self.auths]
        for other_auth in user.auths:
            # If the merged user has an authentication for a provider this
            # user already has, keep this user's one and skip the other.
            # Not expected to happen, but it's possible, e.g. if a user has
            # two twitter accts and authenticated each on different user accts.
            if other_auth.provider in own_providers:
                continue
            other_auth.user = self
        db.session.delete(user)
        db.session.commit()

    @staticmethod
    def for_provider(provider, provider_id):
        """
        Look up the user owning a provider authentication.

        Args:
            | provider (str)        -- the provider name, e.g. 'twitter'
            | provider_id (str)     -- the user id assigned by the provider

        Returns the user, or None when no such authentication is found.
        """
        auth = Auth.for_provider(provider, provider_id)
        return auth.user if auth else None
示例#9
0
class Concept(Model):
    """
    A concept,
    which could be a place, person,
    organization, topic, etc.

    You should *not* set the `slug` or `uri`;
    they are set automatically according to the `name`.
    In the spirit of Python's developer maturity,
    you're trusted not to modify them.
    """
    name = db.Column(db.UnicodeText)
    slug = db.Column(db.String(255), primary_key=True)
    uri = db.Column(db.String)
    summary = db.Column(db.UnicodeText)
    image = db.Column(db.String)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    aliases = db.relationship('Alias', backref='concept', lazy='joined')
    commonness = db.Column(db.Float, default=0.0)

    # Mapping concepts to concepts,
    # and tracking mentions of other concepts in this concept's summary.
    mentions = db.relationship('Alias',
                               secondary=concepts_mentions,
                               backref=db.backref('concepts'))
    concept_associations = db.relationship(
        ConceptConceptAssociation,
        foreign_keys=[ConceptConceptAssociation.from_concept_slug],
        backref=db.backref('from_concept'),
        cascade='all, delete-orphan')

    # Default data sources. This list is shared by every instance,
    # so it must never be mutated in place (see `profile`).
    _sources = ['Wikipedia', 'DBpedia']

    def __init__(self, name):
        """
        Initialize a concept by a name, which can be
        an alias (it does not have to be the canonical name).
        This specified name will be saved as an Alias.

        A canonical name will be looked for; if one is found
        it will be used as the slug for this Concept.

        Args:
            | name (str) -- a name (canonical or alias) for the concept
        """
        self.aliases.append(Alias(name))

        # Try to get a canonical URI
        # and derive the slug from that.
        self.uri = knowledge.uri_for_name(name)
        if self.uri:
            self.slug = self.uri.split('/')[-1]
            k = knowledge.knowledge_for(uri=self.uri, fallback=True)
            self.commonness = knowledge.commonness_for_uri(self.uri)

            self.summary = k['summary']
            self.name = k['name']

            # Download the image.
            if k['image'] is not None:
                ext = splitext(k['image'])[-1].lower()
                # NOTE(review): on Python 3 the built-in hash() of a str
                # is salted per process unless PYTHONHASHSEED is fixed, so
                # this filename is not reproducible across runs -- confirm
                # whether a stable digest is wanted here.
                self.image = storage.save_from_url(
                    k['image'], '{0}{1}'.format(hash(self.slug), ext))

        # If no URI was found,
        # generate our own slug.
        # Note: A problem here is that it assumes that
        # this particular name is the canonical one,
        # and that we don't collect any information for it.
        else:
            self.slug = slugify(name)
            # Commonness is set to default of 0.0,
            # which makes sense because if there's no URI for it
            # it probably is not common at all.

    @property
    def names(self):
        """
        Returns the names of all of this concept's aliases.
        """
        return [alias.name for alias in self.aliases]

    @property
    def sources(self):
        """
        Returns the data sources
        used for this concept.
        """
        return self._sources

    @property
    def concepts(self):
        """
        Returns the concepts this
        concept points *to*,
        with their importance scores
        for this concept.

        If there is a summary but no associations yet,
        the summary is processed first (see `conceptize`).
        """
        if self.summary and not len(self.concept_associations):
            self.conceptize()

        def with_score(assoc):
            assoc.concept.score = assoc.score
            return assoc.concept

        return list(map(with_score, self.concept_associations))

    @property
    def from_concepts(self):
        """
        Returns the concepts that
        point to this concept,
        with their importance scores
        for this concept.
        """
        def with_score(assoc):
            assoc.from_concept.score = assoc.score
            return assoc.from_concept

        return list(map(with_score, self.from_concept_associations))

    @property
    def stories(self):
        """
        Return the stories associated with this concept,
        adding an additional "relatedness" value
        which is the concept's importance score for
        a particular story.
        """
        def with_score(assoc):
            assoc.story.relatedness = assoc.score
            return assoc.story

        return list(map(with_score, self.story_associations))

    @property
    def events(self):
        """
        Same as the `stories` property
        but for events.
        """
        def with_score(assoc):
            assoc.event.relatedness = assoc.score
            return assoc.event

        return list(map(with_score, self.event_associations))

    @property
    def articles(self):
        """
        Same as the `stories` property
        but for articles.
        """
        def with_score(assoc):
            assoc.article.relatedness = assoc.score
            return assoc.article

        return list(map(with_score, self.article_associations))

    @property
    def related_concepts(self):
        """
        Returns the concepts this concept points to
        followed by the concepts pointing to it.
        """
        # `concepts` is the property holding the "to" side; this class
        # defines no `to_concepts` attribute.
        return self.concepts + self.from_concepts

    @property
    def profile(self):
        """
        Returns a data profile specifically
        for this concept's type.

        The profile is fetched on first access and kept on the
        instance; any sources it lists are added to this
        concept's sources.
        """
        if not getattr(self, '_profile', None):
            self._profile = knowledge.profiles.get_profile(self.uri)
            # Build a new list instead of using `+=`: an in-place extend
            # would modify the class-level default shared by all concepts.
            self._sources = (list(self._sources) +
                             list(self._profile.get('sources', [])))
        return self._profile

    def conceptize(self):
        """
        Process the concept summary for concepts,
        and add the appropriate mentions.

        Each concept name found in the summary is resolved to an
        existing Concept (by slug) or a newly created and committed
        one. The resulting concept associations replace
        `self.concept_associations`.
        """
        concepts = []
        for c_name in gx.concepts(self.summary):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)

            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            # If a concept is found...
            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                # Avoid duplicate aliases: the same name can occur
                # several times in one summary.
                if alias not in self.mentions:
                    self.mentions.append(alias)

            # If one doesn't exist, create a new one.
            else:
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance: occurrences (less the concept's
        # commonness) as a fraction of all concept occurrences found.
        total_found = len(concepts)
        counter = Counter(concepts)

        # `counter` has one key per unique concept, so `total_found`
        # is never zero inside this loop.
        assocs = []
        for concept in counter:
            score = (counter[concept] - concept.commonness) / total_found
            assocs.append(ConceptConceptAssociation(concept, score))

        self.concept_associations = assocs
示例#10
0
class Article(Clusterable):
    """
    An article.
    """
    __tablename__ = 'article'
    __concepts__ = {
        'association_model': ArticleConceptAssociation,
        'backref_name': 'article'
    }
    __mentions__ = {'secondary': articles_mentions, 'backref_name': 'articles'}
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String)
    ignore = db.Column(db.Boolean, default=False)
    score = db.Column(db.Float, default=0.0)
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    feed_id = db.Column(db.Integer, db.ForeignKey('feed.id'))
    node_id = db.Column(db.Integer, unique=True, index=True)
    authors = db.relationship('Author',
                              secondary=articles_authors,
                              backref=db.backref('articles', lazy='dynamic'))

    # There are some articles which are just noise, and we want to ignore them using regexes for their titles.
    ignore_patterns = [
        # NYT country profiles
        re.compile(r'[A-Z].+\sprofile( - Overview)?')
    ]

    def __str__(self):
        return self.title

    def __repr__(self):
        return self.title

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute, then
        process the text for concepts (if there is any text),
        default the score to 0.0 and apply the ignore patterns.
        """
        for key in kwargs:
            setattr(self, key, kwargs[key])

        if self.text is not None:
            self.conceptize()

        if self.score is None:
            self.score = 0.0

        self.check_ignored()

    def check_ignored(self):
        """
        Set and return `ignore`: True when the title matches
        (from its start) any of `ignore_patterns`, else False.

        An article without a title is never ignored.
        """
        title = self.title
        # Guard against a missing title: `pattern.match(None)` raises
        # TypeError, and `__init__` calls this unconditionally.
        self.ignore = title is not None and any(
            pattern.match(title) for pattern in self.ignore_patterns)
        return self.ignore

    def conceptize(self):
        """
        Process the article text for concepts,
        and add the appropriate mentions.

        Each concept name found in the text is resolved to an
        existing Concept (by slug) or a newly created and committed
        one. The resulting concept associations replace
        `self.concept_associations`.
        """
        concepts = []
        for c_name in gx.concepts(self.text):
            # Search for the concept.
            uri = knowledge.uri_for_name(c_name)

            if uri:
                slug = uri.split('/')[-1]
            else:
                slug = slugify(c_name)
            c = Concept.query.get(slug)

            # If a concept is found...
            if c:
                # Add this name as a new alias, if necessary.
                alias = Alias.query.filter_by(name=c_name, concept=c).first()
                if not alias:
                    alias = Alias(c_name)
                    c.aliases.append(alias)
                # Avoid duplicate aliases.
                if alias not in self.mentions:
                    self.mentions.append(alias)

            # If one doesn't exist, create a new one.
            else:
                c = Concept(c_name)
                self.mentions.append(c.aliases[0])
                db.session.add(c)
                db.session.commit()

            concepts.append(c)

        # Score the concepts' importance: each concept's share of
        # all concept occurrences found in the text.
        total_found = len(concepts)
        counter = Counter(concepts)

        # `counter` has one key per unique concept, so `total_found`
        # is never zero inside this loop.
        assocs = []
        for concept in counter:
            score = counter[concept] / total_found
            assocs.append(ArticleConceptAssociation(concept, score))

        self.concept_associations = assocs

    @property
    def published(self):
        """
        Convert `created_at` to seconds relative to `epoch`.

        `epoch` is defined outside this class; presumably it is a
        timezone-aware Unix epoch -- confirm at its definition.
        """
        created_at = self.created_at
        # If no timezone is set, assume UTC.
        # super annoying and it's probably not a good guess but it's
        # all we got for now.
        # In production, we will be setting article publish times as utc when
        # we fetch them, so it should be less of a problem there.
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=pytz.UTC)
        # A timezone-aware `created_at` is used as-is.
        delta = created_at - epoch
        return delta.total_seconds()
示例#11
0
class Article(Clusterable):
    """
    An article, together with its entities and
    its cached vector representations.
    """
    __tablename__ = 'article'
    __entities__ = {'secondary': articles_entities, 'backref_name': 'articles'}
    vectors = db.Column(db.PickleType)
    title = db.Column(db.Unicode)
    text = db.Column(db.UnicodeText)
    html = db.Column(db.UnicodeText)
    ext_url = db.Column(db.Unicode)
    image = db.Column(db.String())
    source_id = db.Column(db.Integer, db.ForeignKey('source.id'))
    authors = db.relationship(
        'Author',
        secondary=articles_authors,
        backref=db.backref('articles', lazy='dynamic'))

    def __init__(self, **kwargs):
        """
        Set every keyword argument as an attribute; when text is
        present, extract its entities and compute its vectors.
        """
        for attr, value in kwargs.items():
            setattr(self, attr, value)

        if self.text is None:
            return
        self.entitize()
        self.vectorize()

    def vectorize(self):
        """
        Returns the vectors representing this article,
        computing and storing them on first use.

        Articles are represented by:
            [bag of words vector, entities vector]
        """
        if self.vectors is not None:
            return self.vectors

        # Inside a method body these names resolve to the module-level
        # helpers, not to this class's attributes of the same name.
        text_vector = vectorize(self.text)
        entity_vector = vectorize(' '.join(entities(self.text)))
        self.vectors = [text_vector, entity_vector]
        return self.vectors

    def entitize(self):
        """
        Process the article text for entities and
        store them as this article's `entities`.
        """
        found = []
        for e_name in entities(self.text):
            # TO DO: Need to find a way of getting canonical name.

            # Look the entity up by its slug;
            # create and commit a new one if it is missing.
            entity = Entity.query.get(slugify(e_name))
            if not entity:
                entity = Entity(e_name)
                db.session.add(entity)
                db.session.commit()
            found.append(entity)
        self.entities = found

    def similarity(self, article):
        """
        Calculate the similarity between this article
        and another article.

        The result is a weighted combination of the text-vector
        similarity, the entity-vector similarity and a score for
        how close together the two articles were created,
        divided by the sum of the weights.
        """
        # Weights for: [text vector, entity vector, publication date]
        coefs = [2, 1, 2]
        own_vecs = self.vectorize()
        other_vecs = article.vectorize()

        sim = 0
        for idx, own_vec in enumerate(own_vecs):
            dist = jaccard(other_vecs[idx], own_vec)

            # Two empty vectors give a jaccard distance of NaN
            # (and jaccard will throw a warning). Treat that as a
            # distance of 1, i.e. they have nothing in common.
            if isnan(dist):
                dist = 1
            sim += (coefs[idx] * (1 - dist))

        # Also take publication dates into account.
        ideal_time = 259200  # 3 days, in seconds

        # Absolute difference between the two creation times, in seconds.
        time_diff = abs(self.created_at - article.created_at).total_seconds()

        # Score is normalized [0, 1], where 1 is within the ideal time,
        # and approaches 0 the longer the difference is from the ideal time.
        if time_diff < ideal_time:
            time_score = 1
        else:
            time_score = ideal_time / time_diff
        sim += (coefs[2] * time_score)

        # Normalize back to [0, 1].
        return sim / sum(coefs)