def calculate_reading_similarity_list(recid, type="pageviews"):
    """Calculate reading similarity data to use in reading similarity
       boxes (``people who downloaded/viewed this file/page have also
       downloaded/viewed'').  Return list of (recid1, score1),
       (recid2,score2), ... for all recidN that were consulted by the
       same people who have also consulted RECID.  The reading
       similarity TYPE can be either `pageviews' or `downloads',
       depending on whether we want to obtain page view similarity or
       download similarity.
    """
    if type == "downloads":
        tablename = "rnkDOWNLOADS"
    else: # default
        tablename = "rnkPAGEVIEWS"
    # firstly compute the set of client hosts who consulted recid:
    client_host_list = run_sql("SELECT DISTINCT(client_host)" + \
                               "  FROM " + tablename + \
                               " WHERE id_bibrec=%s " + \
                               "   AND client_host IS NOT NULL",
                               (recid,))
    # secondly look up all recids that were consulted by these client hosts,
    # and order them by the number of different client hosts reading them:
    res = []
    if client_host_list != ():
        # flatten the host tuples into a comma-separated string for the SQL
        # IN clause, stripping Python long-integer 'L' suffixes and brackets:
        client_host_list = str(database_tuples_to_single_list(client_host_list))
        client_host_list = client_host_list.replace("L", "")
        client_host_list = client_host_list.replace("[", "")
        client_host_list = client_host_list.replace("]", "")
        res = run_sql("SELECT id_bibrec,COUNT(DISTINCT(client_host)) AS c" \
                      "  FROM " + tablename + \
                      " WHERE client_host IN (" + client_host_list + ")" + \
                      "   AND id_bibrec != %s" \
                      " GROUP BY id_bibrec ORDER BY c DESC LIMIT 10",
                      (recid,))
    return res
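
# A hedged alternative for the IN-clause query above, sketched for
# illustration only: instead of splicing host numbers into the SQL string,
# build one %s placeholder per host and let run_sql bind the values.  It
# assumes run_sql accepts a parameter tuple (as in the calls above) and a
# non-empty client_hosts list (as guarded by the caller); the helper name
# itself is hypothetical.
def _similar_records_sketch(tablename, client_hosts, recid):
    placeholders = ",".join(["%s"] * len(client_hosts))
    return run_sql("SELECT id_bibrec,COUNT(DISTINCT(client_host)) AS c"
                   "  FROM " + tablename +
                   " WHERE client_host IN (" + placeholders + ")"
                   "   AND id_bibrec != %s"
                   " GROUP BY id_bibrec ORDER BY c DESC LIMIT 10",
                   tuple(client_hosts) + (recid,))
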
def calculate_reading_similarity_list(recid, type="pageviews"):
    """Calculate reading similarity data to use in reading similarity
       boxes (``people who downloaded/viewed this file/page have also
       downloaded/viewed'').  Return list of (recid1, score1),
       (recid2,score2), ... for all recidN that were consulted by the
       same people who have also consulted RECID.  The reading
       similarity TYPE can be either `pageviews' or `downloads',
       depending on whether we want to obtain page view similarity or
       download similarity.
    """
    if CFG_CERN_SITE:
        return []  # CERN hack 2009-11-23 to ease the load
    if type == "downloads":
        tablename = "rnkDOWNLOADS"
    else:  # default
        tablename = "rnkPAGEVIEWS"
    # firstly compute the set of client hosts who consulted recid:
    client_host_list = run_sql("SELECT DISTINCT(client_host)" + \
                               "  FROM " + tablename + \
                               " WHERE id_bibrec=%s " + \
                               "   AND client_host IS NOT NULL",
                               (recid,))
    # secondly look up all recids that were consulted by these client hosts,
    # and order them by the number of different client hosts reading them:
    res = []
    if client_host_list != ():
        # flatten the host tuples into a comma-separated string for the SQL
        # IN clause, stripping Python long-integer 'L' suffixes and brackets:
        client_host_list = str(
            database_tuples_to_single_list(client_host_list))
        client_host_list = client_host_list.replace("L", "")
        client_host_list = client_host_list.replace("[", "")
        client_host_list = client_host_list.replace("]", "")
        res = run_sql("SELECT id_bibrec,COUNT(DISTINCT(client_host)) AS c" \
                      "  FROM " + tablename + \
                      " WHERE client_host IN (" + client_host_list + ")" + \
                      "   AND id_bibrec != %s" \
                      " GROUP BY id_bibrec ORDER BY c DESC LIMIT 10",
                      (recid,))
    return res
    def test_database_tuples_to_single_list(self):
        """bibrank downloads indexer - database tuples to list"""
        self.assertEqual([1, 2, 3],
                         bibrank_downloads_indexer.database_tuples_to_single_list(
                             ((1,), (2,), (3,))))
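    def test_database_tuples_to_single_list_empty(self):
        """bibrank downloads indexer - empty result set (hedged sketch)"""
        # A companion test sketched here as an assumption: if the helper
        # simply flattens one-element DB rows, an empty result tuple
        # should map to an empty list.
        self.assertEqual([], bibrank_downloads_indexer.database_tuples_to_single_list(()))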
def create_download_history_graph_and_box(id_bibrec, ln=CFG_SITE_LANG):
    """Create graph with citation history for record ID_BIBREC (into a
       temporary file) and return HTML box refering to that image.
       Called by Detailed record pages.
       Notes:
        if id_bibdoc=0 : its an oustide-stored document and it has no id_bibdoc --> only one line
        if len(id_bibdocs) <= cfg_id_bibdoc_id_bibrec draw one line per id_bibdoc
        if len(id_bibdocs) > cfg_id_bibdoc_id_bibrec draw only one line which hold simultaneously the downloads for all id_bibdoc
        Each time this function is called, all the images older than 10 minutes are deleted.
    """
    _ = gettext_set_language(ln)

    out = ""

    # Prepare downloads history graph:
    if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS:
        html_content = ""
        # remove images older than 10 minutes
        remove_old_img("download")
        # download count graph
        id_bibdocs = intbitset(run_sql("select distinct id_bibdoc from rnkDOWNLOADS where id_bibrec=%s", (id_bibrec, )))

        id_existing_bibdocs = intbitset(run_sql("SELECT id_bibdoc FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE id_bibrec=%s AND status<>'DELETED'", (id_bibrec, )))

        ## FIXME: when bibdocs are deleted we lose the stats. What shall we do with them?
        id_bibdocs &= id_existing_bibdocs

        history_analysis_results = ()
        if not id_bibdocs:
            pass
        elif len(id_bibdocs) <= cfg_id_bibdoc_id_bibrec and 0 not in id_bibdocs:
            history_analysis_results = draw_downloads_statistics(id_bibrec, list(id_bibdocs))
        else:
            history_analysis_results = draw_downloads_statistics(id_bibrec, [])
        if history_analysis_results and history_analysis_results[0]:
            if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS == 2:
                graph_file_history = CFG_WEBDIR + "/img/" + history_analysis_results[0]
                html_content += """<tr><td valign=center align=center>%s</td>""" % open(graph_file_history).read()
            else:  # gnuplot
                graph_file_history = CFG_SITE_URL + "/img/" + history_analysis_results[0]
                html_content += """<tr><td valign=center align=center><img src='%s'/></td>""" % graph_file_history
            file_to_close_history = history_analysis_results[1]
            if file_to_close_history:
                if os.path.exists(file_to_close_history):
                    os.unlink(file_to_close_history)
        if html_content != "":
            out += """<br/><br/><table><tr><td class="blocknote">
                      %s</td></tr><tr><td>
                      <table border="0" cellspacing="1" cellpadding="1">""" % _("Download history:")
            out += html_content + "</table></td></tr></table>"

    if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION:
        # do we show also user IP repartition?
        html_content = ""
        remove_old_img("download")
        #Users analysis graph
        # use a bound parameter instead of string interpolation, so the
        # recid cannot break out of the SQL statement:
        ips = database_tuples_to_single_list(run_sql("SELECT client_host FROM rnkDOWNLOADS WHERE id_bibrec=%s", (id_bibrec,)))
        if ips:
            users_analysis_results = create_users_analysis_graph(id_bibrec, ips)
            if users_analysis_results[0]:
                #graph_file_users = CFG_SITE_URL + "/img/"  + users_analysis_results[0]
                graph_file_users = CFG_WEBDIR + "/img/"  + users_analysis_results[0]
                file_to_close_users = users_analysis_results[1]
                html_content += """<tr><td valign=center align=center>%s</td>""" % open(graph_file_users).read()
                if file_to_close_users:
                    if os.path.exists(file_to_close_users):
                        os.unlink(file_to_close_users)
        if html_content != "":
            out += """<br/><br/><table><tr><td class="blocknote">
                      %s</td></tr><tr><td>
                      <table border="0" cellspacing="1" cellpadding="1">""" %  _("Download user distribution:")
            out += html_content
            out += "</table></td></tr></table>"

    # return html code used by get_file or search_engine
    return out
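
# Minimal usage sketch (everything except create_download_history_graph_and_box
# itself is a hypothetical name): embed the statistics box on a detailed
# record page, falling back to an empty string when no graphs are enabled.
def render_record_statistics_sketch(recid, ln=CFG_SITE_LANG):
    html = create_download_history_graph_and_box(recid, ln)
    if html:
        return "<div class='record-statistics'>%s</div>" % html
    return ""
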
def create_download_history_graph_and_box(id_bibrec, ln=CFG_SITE_LANG):
    """Create graph with citation history for record ID_BIBREC (into a
       temporary file) and return HTML box refering to that image.
       Called by Detailed record pages.
       Notes:
        if id_bibdoc=0 : its an oustide-stored document and it has no id_bibdoc --> only one line
        if len(id_bibdocs) <= cfg_id_bibdoc_id_bibrec draw one line per id_bibdoc
        if len(id_bibdocs) > cfg_id_bibdoc_id_bibrec draw only one line which hold simultaneously the downloads for all id_bibdoc
        Each time this function is called, all the images older than 10 minutes are deleted.
    """
    _ = gettext_set_language(ln)

    out = ""

    # Prepare downloads history graph:
    if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS:
        html_content = ""
        # remove images older than 10 minutes
        remove_old_img("download")
        # download count graph
        id_bibdocs = intbitset(
            run_sql(
                "select distinct id_bibdoc from rnkDOWNLOADS where id_bibrec=%s",
                (id_bibrec, )))

        id_existing_bibdocs = intbitset(
            run_sql(
                "SELECT id_bibdoc FROM bibrec_bibdoc JOIN bibdoc ON id_bibdoc=id WHERE id_bibrec=%s AND status<>'DELETED'",
                (id_bibrec, )))

        ## FIXME: when bibdocs are deleted we lose the stats. What shall we do with them?
        id_bibdocs &= id_existing_bibdocs

        history_analysis_results = ()
        if not id_bibdocs:
            pass
        elif len(
                id_bibdocs) <= cfg_id_bibdoc_id_bibrec and 0 not in id_bibdocs:
            history_analysis_results = draw_downloads_statistics(
                id_bibrec, list(id_bibdocs))
        else:
            history_analysis_results = draw_downloads_statistics(id_bibrec, [])
        if history_analysis_results and history_analysis_results[0]:
            if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS == 2:
                graph_file_history = CFG_WEBDIR + "/img/" + history_analysis_results[
                    0]
                html_content += """<tr><td valign=center align=center>%s</td>""" % open(
                    graph_file_history).read()
            else:  # gnuplot
                graph_file_history = CFG_SITE_URL + "/img/" + history_analysis_results[
                    0]
                html_content += """<tr><td valign=center align=center><img src='%s'/></td>""" % graph_file_history
            file_to_close_history = history_analysis_results[1]
            if file_to_close_history:
                if os.path.exists(file_to_close_history):
                    os.unlink(file_to_close_history)
        if html_content != "":
            out += """<table border="0" cellspacing="1" cellpadding="1">"""
            out += html_content + "</table>"

    if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION:
        # do we show also user IP repartition?
        html_content = ""
        remove_old_img("download")
        #Users analysis graph
        # use a bound parameter instead of string interpolation, so the
        # recid cannot break out of the SQL statement:
        ips = database_tuples_to_single_list(
            run_sql(
                "SELECT client_host FROM rnkDOWNLOADS WHERE id_bibrec=%s",
                (id_bibrec, )))
        if ips:
            users_analysis_results = create_users_analysis_graph(
                id_bibrec, ips)
            if users_analysis_results[0]:
                file_to_close_users = users_analysis_results[1]
                if CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION == 1:
                    html_content += """<tr><td valign=center align=center><img src='%s/img/%s' align="center" alt=""></td>""" % (
                        CFG_SITE_URL, users_analysis_results[0])
                elif CFG_BIBRANK_SHOW_DOWNLOAD_GRAPHS_CLIENT_IP_DISTRIBUTION == 2:
                    html_content += """<tr><td valign=center align=center>%s</td>""" % open(
                        CFG_WEBDIR + "/img/" +
                        users_analysis_results[0]).read()
                if file_to_close_users:
                    if os.path.exists(file_to_close_users):
                        os.unlink(file_to_close_users)
        if html_content != "":
            out += """<br/><br/><table><tr><td class="blocknote">
                      %s</td></tr><tr><td>
                      <table border="0" cellspacing="1" cellpadding="1">""" % _(
                "Download user distribution:")
            out += html_content
            out += "</table></td></tr></table>"

    # return html code used by get_file or search_engine
    return out
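
# A minimal sketch of what remove_old_img("download") is expected to do,
# based only on the docstring above ("all images older than 10 minutes are
# deleted").  The glob pattern and the CFG_WEBDIR/img layout are
# assumptions, not the actual Invenio implementation.
import glob
import time

def remove_old_img_sketch(prefix, max_age_seconds=600):
    """Delete generated graph images older than max_age_seconds."""
    now = time.time()
    for path in glob.glob(os.path.join(CFG_WEBDIR, "img", prefix + "*")):
        if now - os.path.getmtime(path) > max_age_seconds:
            os.unlink(path)
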
def calculate_reading_similarity_list(recid, type="pageviews"):
    """Calculate reading similarity data to use in reading similarity
       boxes (``people who downloaded/viewed this file/page have also
       downloaded/viewed'').  Return list of (recid1, score1),
       (recid2,score2), ... for all recidN that were consulted by the
       same people who have also consulted RECID.  The reading
       similarity TYPE can be either `pageviews' or `downloads',
       depending on whether we want to obtain page view similarity or
       download similarity.
    """
    if CFG_CERN_SITE:
        return [] # CERN hack 2009-11-23 to ease the load
    if type == "downloads":
        tablename = "rnkDOWNLOADS"
    else: # default
        tablename = "rnkPAGEVIEWS"

    # firstly compute the set of client hosts who consulted recid:
    client_host_list = run_sql("SELECT DISTINCT(client_host)" + \
                               " FROM " + tablename + \
                               " WHERE id_bibrec=%s " + \
                               " AND client_host IS NOT NULL",
                               (recid,))

    # BF: let's display blogs that were read by people who
    # also read the parent blog of the current recid
    from invenio.webblog_utils import get_parent_blog
    parent_blog = get_parent_blog(recid)
    res = []
    if parent_blog:
        if client_host_list != ():
            # flatten the host tuples into a comma-separated string for the
            # SQL IN clause, stripping Python long-integer 'L' suffixes and
            # brackets:
            client_host_list = str(database_tuples_to_single_list(client_host_list))
            client_host_list = client_host_list.replace("L", "")
            client_host_list = client_host_list.replace("[", "")
            client_host_list = client_host_list.replace("]", "")

            res = run_sql("SELECT CAST(b.value AS UNSIGNED), COUNT(DISTINCT(client_host)) AS c" \
                      "  FROM rnkPAGEVIEWS v, bibrec_bib76x bb, bib76x b WHERE client_host IN (" + client_host_list + ")" + \
                      "   AND v.id_bibrec != %s" \
                      "   AND v.id_bibrec = bb.id_bibrec" \
                      "   AND bb.id_bibxxx = b.id" \
                      "   AND b.tag = '760__w'" \
                      "   AND b.value != %s" \
                      " GROUP BY b.value ORDER BY c",
                      (recid, parent_blog))

        # secondly look up all recids that were consulted by these client hosts,
        # and order them by the number of different client hosts reading them:
#        res = run_sql("SELECT id_bibrec,COUNT(DISTINCT(client_host)) AS c" \
#                      "  FROM " + tablename + \
#                      " WHERE client_host IN (" + client_host_list + ")" + \
#                      "   AND id_bibrec != %s" \
#                      " GROUP BY id_bibrec ORDER BY c DESC LIMIT 10",
#                      (recid,))

    #BF: let's group the records by blog collection
#    results = {}
#    res = list(res)
#    if get_fieldvalues(recid, '980__a')[0] == 'BLOG':
#        blog_descendants = get_blog_descendants(recid)
#        for row in res:
#            recid = row[0]
#            # recid is a post, comment or page from a different BLOG
#            if recid not in blog_descendants:
#                # let's get the blog to which belongs each record
#                parent_blog = get_parent_blog(recid)
#                if parent_blog not in results:
#                    results.update([(parent_blog, row[1])])
#                else:
#                    results.update([(parent_blog, results[parent_blog] + 1)])
#
#        res = results.items()

    return res
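
# A minimal sketch of the database_tuples_to_single_list helper used
# throughout this module, consistent with the unit test earlier in this
# section (((1,), (2,), (3,)) -> [1, 2, 3]).  The real Invenio
# implementation may differ; this only illustrates the assumed contract.
def database_tuples_to_single_list_sketch(rows):
    """Flatten a sequence of one-element DB rows into a flat list."""
    return [row[0] for row in rows]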