예제 #1
0
def top_year(year, connection):
    """Return the 25 most-downloaded articles that were posted in `year`.

    Downloads are the sum of the `pdf` column over `article_traffic` rows
    recorded for `year`, restricted to articles whose `posted` date falls
    between January 1 and December 31 of that same year.

    Arguments:
      - year: The calendar year to rank (e.g. 2019).
      - connection: An object whose `read(sql, params)` method runs a query
            and returns the resulting rows.
    Returns:
      - A list of models.SearchResultArticle objects ordered by downloads,
            descending; empty when no rows match.
    """
    # Build the date bounds in Python and bind them as whole values. Putting
    # a placeholder inside a quoted SQL literal ('%s-01-01') only works when
    # the driver happens to render the value without quotes of its own, and
    # breaks as soon as `year` arrives as a string.
    first_day = f"{year}-01-01"
    last_day = f"{year}-12-31"
    resp = connection.read("""
    SELECT SUM(t.pdf) as downloads, t.article, a.url,
      a.title, a.abstract, a.collection, a.posted, a.doi
      FROM article_traffic t
      INNER JOIN articles a ON t.article=a.id
      WHERE t.year = %s
        AND a.posted >= %s
        AND a.posted <= %s
      GROUP BY 2,3,4,5,6,7,8
      ORDER BY 1 DESC
      LIMIT 25
    """, (year, first_day, last_day))
    return [models.SearchResultArticle(a, connection) for a in resp]
예제 #2
0
def paper_query(q, categories, timeframe, metric, page, page_size, connection):
    """Return one page of the top-ranked papers that meet a set of constraints.

    Two statements are built from the same FROM/WHERE fragment: one counts
    every matching article, the other fetches the requested page of results.

    Arguments:
      - q:  A search string to compare against article abstracts,
            titles and author names. (Title matches are weighted most
            heavily.) An empty string disables the text search.
      - categories: A list of bioRxiv categories the results can be in; an
            empty list disables category filtering.
      - timeframe: A description of the range of dates on which to base the
            rankings. For the "downloads" metric: "alltime", "ytd" or
            "lastmonth". For the "twitter" metric: "alltime", "day", "week",
            "month" or "year".
      - metric: Which article-level statistic to sort by: "downloads" or
            "twitter".
      - page: Which page of the results to display (0-indexed)
      - page_size: How many entries should be returned
      - connection: An object whose `read(sql, params)` method runs a query
            and returns the resulting rows.
    Returns:
      - A tuple `(results, total)`: `results` is a list of
            models.SearchResultArticle objects for the requested page, best
            ranked first; `total` is the number of distinct articles matching
            the constraints across all pages.
    Raises:
      - ValueError: if `metric` is not "downloads" or "twitter".
      - KeyError: if `timeframe` is not valid for the chosen metric.
    """
    # Metric-specific pieces: the value selected, the joined table and the
    # sort order.
    if metric == "downloads":
        rank_tables = {
            "alltime": "alltime_ranks",
            "ytd": "ytd_ranks",
            "lastmonth": "month_ranks",
        }
        score = "r.downloads"
        joined = f"{rank_tables[timeframe]} AS r ON r.article=a.id"
        ordering = "r.rank ASC"
    elif metric == "twitter":
        score = "SUM(r.count)"
        joined = "crossref_daily AS r ON r.doi=a.doi"
        ordering = "SUM(r.count) DESC"
    else:
        # Fail before sending a malformed statement to the database.
        raise ValueError(f"unsupported metric: {metric!r}")

    searching = q != ""
    filtering = len(categories) > 0

    schema = config.db["schema"]
    from_clause = f" FROM {schema}.articles AS a INNER JOIN {schema}.{joined}"
    # Parameters are collected in the order their placeholders appear.
    params = []
    if searching:
        from_clause += (
            ", plainto_tsquery(%s) query, "
            "coalesce(setweight(a.title_vector, 'A') || "
            "setweight(a.abstract_vector, 'C') || "
            "setweight(a.author_vector, 'D')) totalvector"
        )
        params.append(q)

    # Collect every applicable filter, then join once. The text-search
    # condition is added whenever a search string is given, including
    # all-time twitter rankings with no category filter, where no other
    # condition is present.
    conditions = []
    if metric == "downloads":
        conditions.append("r.downloads > 0")
    if searching:
        conditions.append("query @@ totalvector")
    if filtering:
        conditions.append("collection=ANY(%s)")
        params.append(categories)
    if metric == "twitter" and timeframe != "alltime":
        # NOTE(review): "day" maps to a 2-day window; kept as-is, confirm
        # this is intentional.
        window_days = {"day": 2, "week": 7, "month": 30, "year": 365}
        conditions.append(
            f"r.source_date > now() - interval '{window_days[timeframe]} days'"
        )

    where_clause = (" WHERE " + " AND ".join(conditions)) if conditions else ""
    filtered = from_clause + where_clause

    # Total number of matches, ignoring pagination.
    count_rows = connection.read(
        "SELECT COUNT(DISTINCT a.id)" + filtered, tuple(params)
    )
    total = count_rows[0][0]

    # The requested page of results.
    page_sql = (
        f"SELECT {score}, a.id, a.url, a.title, a.abstract, a.collection,"
        " a.posted, a.doi" + filtered
    )
    if metric == "twitter":
        page_sql += " GROUP BY a.id"
    # LIMIT/OFFSET are bound as parameters instead of being formatted into
    # the statement text.
    page_sql += f" ORDER BY {ordering} LIMIT %s"
    page_params = params + [page_size]
    if page > 0:
        page_sql += " OFFSET %s"
        page_params.append(page * page_size)
    page_sql += ";"

    rows = connection.read(page_sql, tuple(page_params))
    results = [models.SearchResultArticle(row, connection) for row in rows]
    return results, total