from datetime import datetime, timedelta

import dateutil.parser
import flask
import lucene
from dateutil.relativedelta import relativedelta
from flask import request
from schema import And, Optional, Regex, Schema, SchemaError, Use
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

# NOTE: `engine`, `logger`, `TO_JSON_KWARGS`, `copy_req_args`,
# `db_query_top_articles`, and `APINoResultError` are module-level names
# defined elsewhere in the project.


def query_top_articles():
    """Handle API request 'top-articles'.

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        articles : dict
            canonical_url : string
            date_captured : string formatted datetime
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            title : string
            upper_day : string formatted datetime
    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')
    q_top_article_schema = Schema({
        Optional('upper_day', default=yesterday):
            And(Regex(r'^\d{4}-\d{2}-\d{2}$'),
                Use(dateutil.parser.parse),
                error='Invalid date, should be yyyy-mm-dd format'),
        # `unicode` implies Python 2; substitute `str` on Python 3
        Optional('most_recent', default=True):
            And(unicode, Use(lambda s: s.lower()),
                lambda s: s in ('true', 'false'),
                Use(lambda s: s == 'true')),
        # WARNING: `eval` on request input is unsafe; a stricter parser
        # such as `ast.literal_eval` would be preferable
        Optional('exclude_tags', default=[]):
            And(Use(eval), error='Invalid exclude_tags input format'),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_article_schema.validate(q_kwargs)
        df = db_query_top_articles(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top article found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            articles=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
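
# Usage sketch (not part of the original module): exercising the handler
# through Flask's test client, assuming the function is registered as the
# view for a '/top-articles' route on an `app` object defined elsewhere.
# The route path and `app` are illustrative assumptions only.
def _example_top_articles_request(app):
    with app.test_client() as client:
        resp = client.get('/top-articles',
                          query_string={'upper_day': '2019-01-31',
                                        'most_recent': 'true'})
        data = flask.json.loads(resp.data)
        # `status` is one of 'OK', 'ERROR', or 'No result error'
        print(data['status'])
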
def look_up_top_articles(cls, session, upper_day, most_recent):
    """Look up top articles.

    Parameters
    ----------
    session : object
        An instance of SQLAlchemy Session.
    upper_day : date
        The right edge of the 30-day window.
    most_recent : bool
        If there is no result in the 30-day window ending at `upper_day`,
        whether to fall back to the most recently available data.
    """
    df = db_query_top_articles(session, upper_day, most_recent)
    if len(df) == 0:
        logger.warning('No result found!')
    else:
        logger.info('Top articles for day %s', upper_day)
    cls.print_psql(df)
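
# Usage sketch (assumption): calling the lookup from a script. The engine
# URL and Session setup below are illustrative, and `command_cls` stands in
# for whatever class actually owns this classmethod.
def _example_look_up(command_cls):
    from datetime import date
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    engine = create_engine('postgresql://localhost/hoaxy')  # assumed URL
    session = sessionmaker(bind=engine)()
    command_cls.look_up_top_articles(session, upper_day=date(2019, 1, 31),
                                     most_recent=True)
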
def generate_top_articles(cls, session, upper_day):
    """Generate top articles.

    Parameters
    ----------
    session : object
        An instance of SQLAlchemy Session.
    upper_day : date
        The right edge of the 30-day window.
    """
    df = db_query_top_articles(session, upper_day, most_recent=False,
                               exclude_tags=[])
    if len(df) > 0:
        logger.warning('Top articles for upper_day %s already exist!',
                       upper_day)
        cls.print_psql(df)
        return
    # 30-day window ending at `upper_day`
    month_delta = relativedelta(days=30)
    lower_day = upper_day - month_delta
    # top 20 most shared articles from 'fact_checking' sites
    q1 = """
        INSERT INTO top20_article_monthly
            (upper_day, date_captured, title, canonical_url,
             site_type, number_of_tweets)
        SELECT :upper_day AS upper_day,
               a.date_captured AS date_captured,
               a.title AS title,
               a.canonical_url AS canonical_url,
               'fact_checking' AS site_type,
               COUNT(DISTINCT tw.id) AS number_of_tweets
        FROM article AS a
            JOIN url AS u ON u.article_id=a.id
            JOIN ass_tweet_url AS atu ON atu.url_id=u.id
            JOIN tweet AS tw ON tw.id=atu.tweet_id
            JOIN site AS s ON s.id=u.site_id
        WHERE a.date_captured BETWEEN :lower_day AND :upper_day
            AND tw.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'fact_checking'
        GROUP BY a.date_captured, a.title, a.canonical_url
        ORDER BY number_of_tweets DESC
        LIMIT 20
        """
    # top 20 most shared articles from 'claim' sites
    q2 = """
        INSERT INTO top20_article_monthly
            (upper_day, date_captured, title, canonical_url,
             site_type, number_of_tweets)
        SELECT :upper_day AS upper_day,
               a.date_captured AS date_captured,
               a.title AS title,
               a.canonical_url AS canonical_url,
               'claim' AS site_type,
               COUNT(DISTINCT tw.id) AS number_of_tweets
        FROM article AS a
            JOIN url AS u ON u.article_id=a.id
            JOIN ass_tweet_url AS atu ON atu.url_id=u.id
            JOIN tweet AS tw ON tw.id=atu.tweet_id
            JOIN site AS s ON s.id=u.site_id
        WHERE a.date_captured BETWEEN :lower_day AND :upper_day
            AND tw.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'claim'
        GROUP BY a.date_captured, a.title, a.canonical_url
        ORDER BY number_of_tweets DESC
        LIMIT 20
        """
    session.execute(
        text(q1).bindparams(lower_day=lower_day, upper_day=upper_day))
    session.execute(
        text(q2).bindparams(lower_day=lower_day, upper_day=upper_day))
    try:
        session.commit()
        df = db_query_top_articles(session, upper_day)
        if len(df) == 0:
            logger.warning('No new top articles found!')
        else:
            logger.info('Inserted new top articles for day %s', upper_day)
        cls.print_psql(df)
    except SQLAlchemyError as e:
        logger.error(e)
        session.rollback()
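
# Usage sketch (assumption): a daily driver that fills the table for the
# window ending yesterday. Session construction would mirror the lookup
# example above; `command_cls` is again the assumed owning class.
def _example_generate_daily(command_cls, session):
    upper_day = datetime.utcnow().date() - timedelta(days=1)
    command_cls.generate_top_articles(session, upper_day)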