from datetime import datetime, timedelta

import dateutil.parser
import flask
import lucene
from dateutil.relativedelta import relativedelta
from flask import request
from schema import And, Optional, Regex, Schema, SchemaError, Use
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError

# NOTE: `engine`, `logger`, `TO_JSON_KWARGS`, `copy_req_args`,
# `db_query_top_articles`, and `APINoResultError` are module-level names
# defined elsewhere in the project.


def query_top_articles():
    """Handle API request 'top-articles'.

    API Request Parameters
    ----------------------
        upper_day : string formatted datetime
        most_recent : bool

    API Response Keys
    -----------------
        status : string
        num_of_entries : int
        articles : dict
            canonical_url : string
            date_captured : string formatted datetime
            number_of_tweets : int
            site_type : {'claim', 'fact_checking'}
            title : string
            upper_day : string formatted datetime
    """
    lucene.getVMEnv().attachCurrentThread()
    yesterday = datetime.utcnow().date() - timedelta(days=1)
    yesterday = yesterday.strftime('%Y-%m-%d')
    q_top_article_schema = Schema({
        Optional('upper_day', default=yesterday):
            And(Regex(r'^\d{4}-\d{2}-\d{2}$'),
                Use(dateutil.parser.parse),
                error='Invalid date, should be yyyy-mm-dd format'),
        # `unicode` implies Python 2; substitute `str` on Python 3
        Optional('most_recent', default=True):
            And(unicode, Use(lambda s: s.lower()),
                lambda s: s in ('true', 'false'),
                Use(lambda s: s == 'true')),
        # WARNING: `eval` on request input is unsafe; a stricter parser
        # such as `ast.literal_eval` would be preferable
        Optional('exclude_tags', default=[]):
            And(Use(eval), error='Invalid exclude_tags input format'),
    })
    q_kwargs = copy_req_args(request.args)
    try:
        q_kwargs = q_top_article_schema.validate(q_kwargs)
        df = db_query_top_articles(engine, **q_kwargs)
        if len(df) == 0:
            raise APINoResultError('No top article found!')
        response = dict(
            status='OK',
            num_of_entries=len(df),
            articles=flask.json.loads(df.to_json(**TO_JSON_KWARGS)))
    except SchemaError as e:
        response = dict(status='ERROR', error=str(e))
    except APINoResultError as e:
        response = dict(status='No result error', error=str(e))
    except Exception as e:
        logger.exception(e)
        response = dict(status='ERROR', error='Server error, query failed')
    return flask.jsonify(response)
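
# Usage sketch (not part of the original module): exercising the handler
# through Flask's test client, assuming the function is registered as the
# view for a '/top-articles' route on an `app` object defined elsewhere.
# The route path and `app` are illustrative assumptions only.
def _example_top_articles_request(app):
    with app.test_client() as client:
        resp = client.get('/top-articles',
                          query_string={'upper_day': '2019-01-31',
                                        'most_recent': 'true'})
        data = flask.json.loads(resp.data)
        # `status` is one of 'OK', 'ERROR', or 'No result error'
        print(data['status'])
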
def look_up_top_articles(cls, session, upper_day, most_recent):
    """Look up top articles.

    Parameters
    ----------
    session : object
        An instance of SQLAlchemy Session.
    upper_day : date
        The right edge of the 30-day window.
    most_recent : bool
        If there is no result in the 30-day window ending at `upper_day`,
        whether to fall back to the most recently available data.
    """
    df = db_query_top_articles(session, upper_day, most_recent)
    if len(df) == 0:
        logger.warning('No result found!')
    else:
        logger.info('Top articles for day %s', upper_day)
    cls.print_psql(df)
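
# Usage sketch (assumption): calling the lookup from a script. The engine
# URL and Session setup below are illustrative, and `command_cls` stands in
# for whatever class actually owns this classmethod.
def _example_look_up(command_cls):
    from datetime import date
    from sqlalchemy import create_engine
    from sqlalchemy.orm import sessionmaker
    engine = create_engine('postgresql://localhost/hoaxy')  # assumed URL
    session = sessionmaker(bind=engine)()
    command_cls.look_up_top_articles(session, upper_day=date(2019, 1, 31),
                                     most_recent=True)
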
def generate_top_articles(cls, session, upper_day):
    """Generate top articles.

    Parameters
    ----------
    session : object
        An instance of SQLAlchemy Session.
    upper_day : date
        The right edge of the 30-day window.
    """
    df = db_query_top_articles(session, upper_day, most_recent=False,
                               exclude_tags=[])
    if len(df) > 0:
        logger.warning('Top articles for upper_day %s already exist!',
                       upper_day)
        cls.print_psql(df)
        return
    # 30-day window ending at `upper_day`
    month_delta = relativedelta(days=30)
    lower_day = upper_day - month_delta
    # top 20 most shared articles from 'fact_checking' sites
    q1 = """
        INSERT INTO top20_article_monthly
            (upper_day, date_captured, title, canonical_url,
             site_type, number_of_tweets)
        SELECT :upper_day AS upper_day,
               a.date_captured AS date_captured,
               a.title AS title,
               a.canonical_url AS canonical_url,
               'fact_checking' AS site_type,
               COUNT(DISTINCT tw.id) AS number_of_tweets
        FROM article AS a
            JOIN url AS u ON u.article_id=a.id
            JOIN ass_tweet_url AS atu ON atu.url_id=u.id
            JOIN tweet AS tw ON tw.id=atu.tweet_id
            JOIN site AS s ON s.id=u.site_id
        WHERE a.date_captured BETWEEN :lower_day AND :upper_day
            AND tw.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'fact_checking'
        GROUP BY a.date_captured, a.title, a.canonical_url
        ORDER BY number_of_tweets DESC
        LIMIT 20
        """
    # top 20 most shared articles from 'claim' sites
    q2 = """
        INSERT INTO top20_article_monthly
            (upper_day, date_captured, title, canonical_url,
             site_type, number_of_tweets)
        SELECT :upper_day AS upper_day,
               a.date_captured AS date_captured,
               a.title AS title,
               a.canonical_url AS canonical_url,
               'claim' AS site_type,
               COUNT(DISTINCT tw.id) AS number_of_tweets
        FROM article AS a
            JOIN url AS u ON u.article_id=a.id
            JOIN ass_tweet_url AS atu ON atu.url_id=u.id
            JOIN tweet AS tw ON tw.id=atu.tweet_id
            JOIN site AS s ON s.id=u.site_id
        WHERE a.date_captured BETWEEN :lower_day AND :upper_day
            AND tw.created_at BETWEEN :lower_day AND :upper_day
            AND s.site_type LIKE 'claim'
        GROUP BY a.date_captured, a.title, a.canonical_url
        ORDER BY number_of_tweets DESC
        LIMIT 20
        """
    session.execute(
        text(q1).bindparams(lower_day=lower_day, upper_day=upper_day))
    session.execute(
        text(q2).bindparams(lower_day=lower_day, upper_day=upper_day))
    try:
        session.commit()
        df = db_query_top_articles(session, upper_day)
        if len(df) == 0:
            logger.warning('No new top articles found!')
        else:
            logger.info('Inserted new top articles for day %s', upper_day)
        cls.print_psql(df)
    except SQLAlchemyError as e:
        logger.error(e)
        session.rollback()
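
# Usage sketch (assumption): a daily driver that fills the table for the
# window ending yesterday. Session construction would mirror the lookup
# example above; `command_cls` is again the assumed owning class.
def _example_generate_daily(command_cls, session):
    upper_day = datetime.utcnow().date() - timedelta(days=1)
    command_cls.generate_top_articles(session, upper_day)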