Exemplo n.º 1
0
def format_element(bfo, default='', separator='; ', style='',
                   show_icons='no', prefix='', suffix=''):
    """Build the HTML list of external links for a record.

    Links are collected in this order: service identifiers found in 035
    (KEKSCAN, CDS, ADS, HAL), an ADS fallback built from arXiv numbers in
    037 when no ADS identifier was seen, further 035 identifiers (msnet,
    zblatt, euclid, osti) and finally the explicit URLs stored in 8564.

    @param default: value returned when no link could be produced
    @param separator: string placed between consecutive links
    @param style: CSS class put on the links (the ADS fallback links are
                  emitted without it)
    @param show_icons: 'yes' (any case) prepends a file icon to every link
    @param prefix: string prepended to a non-empty result
    @param suffix: string appended to a non-empty result
    """
    _ = gettext_set_language(bfo.lang)
    css = style
    if css != "":
        css = ' class="' + css + '"'

    links = []

    # First pass over 035: $9 names the service, $a carries its identifier.
    has_ads_id = False
    for field in bfo.fields('035__'):
        source = field.get('9', None)
        value = field.get('a', None)
        if source is None or value is None:
            continue

        if source == 'KEKSCAN':
            # dashes are stripped from the KEK identifier
            links.append('<a%s href="%s%s"> KEK scanned document</a>' %
                         (css, KEK, value.replace("-", "")))
        elif source == 'CDS':
            links.append('<a%s href="%s%s"> CERN Document Server</a>' %
                         (css, CDS, value))
        elif source == 'ADS':
            # '&' (as in A&A) has to be percent-encoded inside the URL
            links.append('<a%s href="%s%s"> ADS Abstract Service</a>' %
                         (css, ADSABS, value.replace('&', '%26')))
            has_ads_id = True
        elif source == 'HAL':
            # HAL links are only produced when isUserAdmin accepts the user
            from invenio.webuser import isUserAdmin
            if isUserAdmin(bfo.user_info):
                links.append('<a %s href="%s%s"> HAL Archives Ouvertes</a>' %
                             (css, HAL, value))

    # Without an explicit ADS identifier, fall back to the arXiv numbers.
    if not has_ads_id:
        # a set, so that a repeated e-print number yields a single link
        arxiv_ids = set(
            field.get('a', '') for field in bfo.fields('037__')
            if field.get('9', '') == 'arXiv' and
            field.get('a', None) is not None)
        for arxiv_id in arxiv_ids:
            links.append('<a href="%s%s"> ADS Abstract Service</a>'
                         % (ADSABS, arxiv_id))

    # Second pass over 035: service labels compared case-insensitively.
    for field in bfo.fields('035__'):
        source = field.get('9', None)
        value = field.get('a', None)
        if source is None or value is None:
            continue

        service = source.lower()
        if service == "msnet":
            links.append('<a%s href="%s%s"> AMS MathSciNet</a>' %
                         (css, MSNET, value))
        elif service == "zblatt":
            links.append('<a%s href="%s%s"> zbMATH</a>' %
                         (css, ZBLATT, value))
        elif service == "euclid":
            links.append('<a%s href="%s%s"> Project Euclid</a>' %
                         (css, EUCLID, value))
        elif service == "osti":
            links.append('<a%s href="%s%s"> OSTI Information Bridge Server</a>' %
                         (css, OSTI, value))

    # Explicit URLs from 8564 ($u = URL, $y = label); self-links are
    # excluded unless they are PDFs of an allowed doctype.
    allowed_doctypes = ["INSPIRE-PUBLIC", "SCOAP3", "PoS"]
    for entry in bfo.fields('8564_'):
        target = entry.get('u', '')
        tag = entry.get('y', '')

        # labels already covered by the identifier-based links above
        if tag.lower() in ("adsabs", "euclid", "msnet", "osti", "zblatt"):
            continue
        if '.png' in target:
            continue
        # Fermilab-labelled URLs are dropped for ATLAS/CMS records
        if tag.lower().startswith("fermilab") and \
           bfo.field("710__g").lower() in \
           ('atlas collaboration', 'cms collaboration'):
            continue
        if tag.upper() == "DURHAM":
            continue

        label = entry.get('y', 'Fulltext')
        on_site = target.startswith(CFG_SITE_URL)
        if target and label.upper() != "DOI" and not on_site:
            # external, non-DOI URL
            wanted = True
        else:
            # local file: only PDFs attached with an allowed doctype
            lowered = target.lower()
            wanted = on_site and \
                (lowered.endswith(".pdf") or
                 lowered.endswith('.pdf?subformat=pdfa')) and \
                bibdocfile_url_to_bibdoc(entry.get('u')).doctype in \
                allowed_doctypes
        if wanted:
            links.append('<a %s href="%s">%s</a>' %
                         (css, target, _lookup_url_name(bfo, label)))

    # Assemble the final output.
    if not links:
        return default
    if show_icons.lower() == 'yes':
        # NB: the continuation line's leading blanks are part of the literal
        img = '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
            % (CFG_BASE_URL, _("Download fulltext"))
        links = [img + '<small>' + link + '</small>' for link in links]
    return prefix + separator.join(links) + suffix
Exemplo n.º 2
0
def format_element(bfo, default='', separator='; ', style='',
                   show_icons='no', prefix='', suffix=''):
    """ Creates html of links based on metadata

    Links come, in order, from: KEKSCAN/CDS identifiers in 035, an ADS
    link per arXiv number in 037 (skipped when the 8564 $y value read via
    bfo.field contains "ADSABS"), msnet/zblatt/euclid identifiers in 035,
    and the explicit URLs in 8564.

    @param default value returned when no link could be produced
    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        style = 'class = "' + style + '"'
        # The class attribute must be separated from href by whitespace,
        # otherwise the tag reads class = "x"href="...".
        url_anchor = '<a ' + style + ' href="'
    else:
        url_anchor = '<a href="'

    links = []

    # KEKSCAN/CDS links
    for ident in bfo.fields('035__'):
        provenance = ident.get('9', '')
        extid = ident.get('a', None)
        if extid is None:
            continue
        if provenance == 'KEKSCAN':
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?' +
                         extid.replace("-", "") +
                         '"> KEK scanned document </a>')
        elif provenance == 'CDS':
            links.append('<a href="http://cds.cern.ch/record/' + extid +
                         '"> CERN Document Server </a>')

    # ADS links
    # NOTE(review): bfo.field presumably yields only the first 8564 $y, so
    # an ADSABS label on a later 8564 field would not be seen - confirm.
    current_links = bfo.field('8564_y')
    if "ADSABS" not in current_links:
        for ident in bfo.fields('037__'):
            if ident.get('9', '') == 'arXiv' and \
               ident.get('a', None) is not None:
                links.append(
                    '<a href="http://adsabs.harvard.edu/cgi-bin/basic_connect?qsearch='
                    + ident.get('a', '') + '">ADS Abstract Service</a>')

    # links moved to new field 035
    for url in bfo.fields('035__'):
        if "9" in url and "a" in url:
            provenance = url["9"].lower()
            if provenance == "msnet":
                links.append(
                    '<a ' + style +
                    ' href="http://www.ams.org/mathscinet-getitem?mr=' +
                    url["a"] + '">AMS MathSciNet</a>')
            elif provenance == "zblatt":
                links.append(
                    '<a ' + style +
                    ' href="http://www.zentralblatt-math.org/zmath/en/search/?an='
                    + url["a"] + '">zbMATH</a>')
            elif provenance == "euclid":
                links.append('<a ' + style +
                             ' href="http://projecteuclid.org/euclid.cmp/=' +
                             url["a"] + '">Project Euclid</a>')

    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in bfo.fields('8564_'):
        # A field without $u must not raise; '' simply produces no link.
        target = url.get('u', '')
        tag = url.get('y', '')

        if tag.lower() in ("msnet", "zblatt", "euclid"):
            continue
        if '.png' in target:
            continue
        # Fermilab-labelled URLs are dropped for ATLAS/CMS records
        if tag.lower().startswith("fermilab") and \
           bfo.field("710__g").lower() in \
           ('atlas collaboration', 'cms collaboration'):
            continue
        if tag.upper() == "DURHAM":
            continue

        label = url.get('y', 'Fulltext')
        on_site = target.startswith(CFG_SITE_URL)
        if target and label.upper() != "DOI" and not on_site:
            # external, non-DOI URL
            wanted = True
        else:
            # local file: only PDFs attached with an allowed doctype
            wanted = on_site and target[-3:].lower() == "pdf" and \
                bibdocfile_url_to_bibdoc(target).doctype in allowed_doctypes
        if wanted:
            links.append(url_anchor + target + '">' +
                         _lookup_url_name(bfo, label) + '</a>')

    # put it all together
    if links:
        if show_icons.lower() == 'yes':
            # NB: the continuation line's leading blanks are part of the
            # string literal
            img = '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
            % (CFG_BASE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
Exemplo n.º 3
0
def format_element(bfo, default='', separator='; ', style='',
                   show_icons='no', prefix='', suffix=''):
    """ Creates html of links based on metadata

    Links come, in order, from: KEKSCAN identifiers in 035, CDS
    identifiers in 035, an ADS link per arXiv number in 037 (skipped when
    the 8564 $y value read via bfo.field contains "ADSABS"),
    msnet/zblatt/euclid identifiers in 035, and the explicit URLs in 8564.
    Incomplete 035/037/8564 fields (missing $9, $a or $u) are ignored
    instead of raising.

    @param default value returned when no link could be produced
    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        style = 'class = "' + style + '"'
        # The class attribute must be separated from href by whitespace,
        # otherwise the tag reads class = "x"href="...".
        url_anchor = '<a ' + style + ' href="'
    else:
        url_anchor = '<a href="'

    links = []

    # Only 035 fields carrying both the service label ($9) and the
    # identifier ($a) can be turned into a link.
    identifiers = [ident for ident in bfo.fields('035__')
                   if ident.get('9') is not None and
                   ident.get('a') is not None]

    # KEKSCAN links
    for ident in identifiers:
        if ident['9'] == 'KEKSCAN':
            out = ident['a'].replace("-", "")
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?' +
                         out + '"> KEK scanned document </a>')

    # CDS links (kept as a separate pass so they follow all KEK links)
    for ident in identifiers:
        if ident['9'] == 'CDS':
            links.append('<a href="http://cds.cern.ch/record/' + ident['a'] +
                         '"> CERN Document Server </a>')

    # ADS links
    # NOTE(review): bfo.field presumably yields only the first 8564 $y, so
    # an ADSABS label on a later 8564 field would not be seen - confirm.
    current_links = bfo.field('8564_y')
    if "ADSABS" not in current_links:
        for ident in bfo.fields('037__'):
            # require $a so that no link with an empty query is produced
            if ident.get('9', '') == 'arXiv' and \
               ident.get('a', None) is not None:
                links.append(
                    '<a href="http://adsabs.harvard.edu/cgi-bin/basic_connect?qsearch='
                    + ident.get('a', '') + '">ADS Abstract Service</a>')

    # could look for other publication info and calculate URls here

    # links moved to new field 035
    for ident in identifiers:
        provenance = ident['9'].lower()
        if provenance == "msnet":
            links.append('<a ' + style +
                         ' href="http://www.ams.org/mathscinet-getitem?mr=' +
                         ident['a'] + '">AMS MathSciNet</a>')
        elif provenance == "zblatt":
            links.append(
                '<a ' + style +
                ' href="http://www.zentralblatt-math.org/zmath/en/search/?an='
                + ident['a'] + '">zbMATH</a>')
        elif provenance == "euclid":
            links.append('<a ' + style +
                         ' href="http://projecteuclid.org/euclid.cmp/=' +
                         ident['a'] + '">Project Euclid</a>')

    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in bfo.fields('8564_'):
        # A field without $u must not raise; '' simply produces no link.
        target = url.get('u', '')
        tag = url.get('y', '')

        if tag.lower() in ("msnet", "zblatt", "euclid"):
            continue
        if '.png' in target:
            continue
        # Fermilab-labelled URLs are dropped for ATLAS/CMS records
        if tag.lower().startswith("fermilab") and \
           bfo.field("710__g").lower() in \
           ('atlas collaboration', 'cms collaboration'):
            continue
        if tag.upper() == "DURHAM":
            continue

        label = url.get('y', 'Fulltext')
        on_site = target.startswith(CFG_SITE_URL)
        if target and label.upper() != "DOI" and not on_site:
            # external, non-DOI URL
            wanted = True
        else:
            # local file: only PDFs attached with an allowed doctype
            wanted = on_site and target[-3:].lower() == "pdf" and \
                bibdocfile_url_to_bibdoc(target).doctype in allowed_doctypes
        if wanted:
            links.append(url_anchor + target + '">' +
                         _lookup_url_name(bfo, label) + '</a>')

    # put it all together
    if links:
        if show_icons.lower() == 'yes':
            # NB: the continuation line's leading blanks are part of the
            # string literal
            img = '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
            % (CFG_BASE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
    def get_words_from_fulltext(self, url_direct_or_indirect):
        """Returns all the words contained in the document specified by
           URL_DIRECT_OR_INDIRECT with the words being split by various
           SRE_SEPARATORS regexp set earlier.  If FORCE_FILE_EXTENSION is
           set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
           file.  (This is interesting to index Indico for example.)  Note
           also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
           the fulltext file or an URL to a setlink-like page body that
           presents the links to be indexed.  In the latter case the
           URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
           to fulltext documents, for all knows file extensions as
           specified by global CONV_PROGRAMS config variable.
        """
        write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
        try:
            if bibdocfile_url_p(url_direct_or_indirect):
                write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
                try:
                    bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
                except InvenioBibDocFileError:
                    # Outdated 8564 tag
                    return []
                indexer = get_idx_indexer('fulltext')
                if indexer != 'native':
                    # A document might belong to multiple records
                    for rec_link in bibdoc.bibrec_links:
                        recid = rec_link["recid"]
                        # Adds fulltexts of all files once per records
                        if not recid in fulltext_added:
                            bibrecdocs = BibRecDocs(recid)
                            try:
                                text = bibrecdocs.get_text()
                            except InvenioBibDocFileError:
                                # Invalid PDF
                                continue
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(recid, text)
                            elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                xapian_add(recid, 'fulltext', text)

                        fulltext_added.add(recid)
                    # we are relying on an external information retrieval system
                    # to provide full-text indexing, so dispatch text to it and
                    # return nothing here:
                    return []
                else:
                    text = ""
                    if hasattr(bibdoc, "get_text"):
                        text = bibdoc.get_text()
                    return self.tokenize_for_words_default(text)
            else:
                if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                    write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                    return []
                write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
                urls_to_index = set()
                for splash_re, url_re in CFG_BIBINDEX_SPLASH_PAGES.iteritems():
                    if re.match(splash_re, url_direct_or_indirect):
                        write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                        html = urllib2.urlopen(url_direct_or_indirect).read()
                        urls = get_links_in_html_page(html)
                        write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                        for url in urls:
                            if re.match(url_re, url):
                                write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                                urls_to_index.add(url)
                if not urls_to_index:
                    urls_to_index.add(url_direct_or_indirect)
                write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
                words = {}
                for url in urls_to_index:
                    tmpdoc = download_url(url)
                    file_converter_logger = get_file_converter_logger()
                    old_logging_level = file_converter_logger.getEffectiveLevel()
                    if self.verbose > 3:
                        file_converter_logger.setLevel(logging.DEBUG)
                    try:
                        try:
                            tmptext = convert_file(tmpdoc, output_format='.txt')
                            text = open(tmptext).read()
                            os.remove(tmptext)

                            indexer = get_idx_indexer('fulltext')
                            if indexer != 'native':
                                if indexer == 'SOLR' and CFG_SOLR_URL:
                                    solr_add_fulltext(None, text) # FIXME: use real record ID
                                if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                    #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                    pass
                                # we are relying on an external information retrieval system
                                # to provide full-text indexing, so dispatch text to it and
                                # return nothing here:
                                tmpwords = []
                            else:
                                tmpwords = self.tokenize_for_words_default(text)
                            words.update(dict(map(lambda x: (x, 1), tmpwords)))
                        except Exception, e:
                            message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                            register_exception(prefix=message, alert_admin=True)
                            write_message(message, stream=sys.stderr)
                    finally:
                        os.remove(tmpdoc)
                        if self.verbose > 3:
                            file_converter_logger.setLevel(old_logging_level)
                return words.keys()
        except Exception, e:
            message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
            register_exception(prefix=message, alert_admin=True)
            write_message(message, stream=sys.stderr)
            return []
Exemplo n.º 5
0
def format_element(bfo, default="", separator="; ", style="", show_icons="no", prefix="", suffix=""):
    """ Creates html of links based on metadata

    Links come, in order, from: the arXiv element (bfe_INSPIRE_arxiv), a
    "Journal Server" DOI link per 773 field carrying $a, KEKSCAN
    identifiers in 035, CDS identifiers in 035, and the explicit URLs in
    8564. Incomplete 035/8564 fields (missing $9, $a or $u) are ignored
    instead of raising.

    @param default value returned when no link could be produced
    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)

    def _anchor_open(*css_classes):
        """Return '<a ... href="' carrying a single class attribute."""
        # One class attribute only: a duplicated attribute is a parse
        # error and the second one is ignored by HTML parsers. The
        # attribute is also separated from href by whitespace.
        names = [name for name in css_classes if name]
        if names:
            return '<a class = "' + " ".join(names) + '" href="'
        return '<a href="'

    links = []

    from invenio.bibformat_elements.bfe_INSPIRE_arxiv import format_element as arxiv

    arxiv_links = arxiv(bfo, links="yes", mirrors="no")
    if arxiv_links:
        links.append(arxiv_links)

    # trivially take care of dois
    for journal in bfo.fields("773"):
        journtitle = ""
        # the OALINKS knowledge base value for $n becomes an extra CSS class
        oa_type = bfo.kb("OALINKS", journal.get("n"), "").lower()
        if journal.get("a"):
            if journal.get("p"):
                journtitle = " - " + journal.get("p")
            links.append(
                _anchor_open(style, oa_type)
                + "http://dx.doi.org/"
                + journal.get("a")
                + '">Journal Server</a>'
                + journtitle
            )

    # Only 035 fields carrying both the service label ($9) and the
    # identifier ($a) can be turned into a link.
    identifiers = [
        ident for ident in bfo.fields("035__")
        if ident.get("9") is not None and ident.get("a") is not None
    ]

    # KEKSCAN links
    for ident in identifiers:
        if ident["9"] == "KEKSCAN":
            out = ident["a"].replace("-", "")
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?' + out + '"> KEK scanned document </a>')

    # CDS links (kept as a separate pass so they follow all KEK links)
    for ident in identifiers:
        if ident["9"] == "CDS":
            links.append('<a href="http://cds.cern.ch/record/' + ident["a"] + '"> CERN Document Server </a>')

    # could look for other publication info and calculate URls here

    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    url_anchor = _anchor_open(style)
    for url in bfo.fields("8564_"):
        # A field without $u must not raise; '' simply produces no link.
        target = url.get("u", "")
        label = url.get("y", "Fulltext")
        on_site = target.startswith(CFG_SITE_URL)
        if target and label.upper() != "DOI" and not on_site:
            # external, non-DOI URL
            wanted = True
        else:
            # local file: test the cheap suffix before the bibdoc lookup
            wanted = (
                on_site
                and target[-3:].lower() == "pdf"
                and bibdocfile_url_to_bibdoc(target).doctype in allowed_doctypes
            )
        if wanted:
            links.append(url_anchor + target + '">' + _lookup_url_name(bfo, label) + "</a>")

    # put it all together
    if links:
        if show_icons.lower() == "yes":
            # NB: the continuation line's leading blanks are part of the
            # string literal
            img = (
                '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>'
                % (CFG_SITE_URL, _("Download fulltext"))
            )
            links = [img + "<small>" + link + "</small>" for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
Exemplo n.º 6
0
def format_element(bfo,
                   default='',
                   separator='; ',
                   style='',
                   show_icons='no',
                   prefix='',
                   suffix=''):
    """Build the HTML list of external links for a record.

    Links are collected in this order: service identifiers found in 035
    (KEKSCAN, CDS, ADS, INIS, HAL), an ADS fallback built from arXiv
    numbers in 037 when no ADS identifier was seen, further 035
    identifiers (msnet, zblatt, euclid, osti) and finally the explicit
    URLs stored in 8564.

    @param default: value returned when no link could be produced
    @param separator: string placed between consecutive links
    @param style: CSS class put on the links (the ADS fallback links are
                  emitted without it)
    @param show_icons: 'yes' (any case) prepends a file icon to every link
    @param prefix: string prepended to a non-empty result
    @param suffix: string appended to a non-empty result
    """
    _ = gettext_set_language(bfo.lang)
    css = style
    if css != "":
        css = ' class="' + css + '"'

    links = []

    # First pass over 035: $9 names the service, $a carries its identifier.
    has_ads_id = False
    for field in bfo.fields('035__'):
        source = field.get('9', None)
        value = field.get('a', None)
        if source is None or value is None:
            continue

        if source == 'KEKSCAN':
            scan_id = value.replace("-", "")
            if len(scan_id) == 7 and scan_id[:2] not in ('19', '20'):
                # 7 characters: year is '19' plus the first two characters
                year = '19' + scan_id[:2]
            elif len(scan_id) == 9:
                # 9 characters: year is the first four characters and the
                # id used in the URL loses its first two characters
                year = scan_id[:4]
                scan_id = scan_id[2:]
            else:
                # likely bad id
                continue
            links.append(
                '<a%s href="%s/%s/%s/%s.pdf"> KEK scanned document</a>' %
                (css, KEK, year, scan_id[:4], scan_id))
        elif source == 'CDS':
            links.append('<a%s href="%s%s"> CERN Document Server</a>' %
                         (css, CDS, value))
        elif source == 'ADS':
            # '&' (as in A&A) has to be percent-encoded inside the URL
            links.append('<a%s href="%s%s"> ADS Abstract Service</a>' %
                         (css, ADSABS, value.replace('&', '%26')))
            has_ads_id = True
        elif source == 'INIS':
            links.append('<a%s href="%s%s"> INIS Repository</a>' %
                         (css, INIS, value))
        elif source == 'HAL':
            # HAL links are only produced when isUserAdmin accepts the user
            from invenio.webuser import isUserAdmin
            if isUserAdmin(bfo.user_info):
                links.append('<a %s href="%s%s"> HAL Archives Ouvertes</a>' %
                             (css, HAL, value))

    # Without an explicit ADS identifier, fall back to the arXiv numbers.
    if not has_ads_id:
        # a set, so that a repeated e-print number yields a single link
        arxiv_ids = set(
            field.get('a', '') for field in bfo.fields('037__')
            if field.get('9', '') == 'arXiv' and
            field.get('a', None) is not None)
        for arxiv_id in arxiv_ids:
            links.append('<a href="%s%s"> ADS Abstract Service</a>' %
                         (ADSABS, arxiv_id))

    # Second pass over 035: service labels compared case-insensitively.
    for field in bfo.fields('035__'):
        source = field.get('9', None)
        value = field.get('a', None)
        if source is None or value is None:
            continue

        service = source.lower()
        if service == "msnet":
            links.append('<a%s href="%s%s"> AMS MathSciNet</a>' %
                         (css, MSNET, value))
        elif service == "zblatt":
            links.append('<a%s href="%s%s"> zbMATH</a>' %
                         (css, ZBLATT, value))
        elif service == "euclid":
            links.append('<a%s href="%s%s"> Project Euclid</a>' %
                         (css, EUCLID, value))
        elif service == "osti":
            links.append(
                '<a%s href="%s%s"> OSTI Information Bridge Server</a>' %
                (css, OSTI, value))

    # Explicit URLs from 8564 ($u = URL, $y = label); self-links are
    # excluded unless they are PDFs of an allowed doctype.
    allowed_doctypes = ["INSPIRE-PUBLIC", "SCOAP3", "PoS"]
    for entry in bfo.fields('8564_'):
        target = entry.get('u', '')
        tag = entry.get('y', '')

        # labels already covered by the identifier-based links above
        if tag.lower() in ("adsabs", "euclid", "msnet", "osti", "zblatt"):
            continue
        if '.png' in target:
            continue
        # Fermilab-labelled URLs are dropped for ATLAS/CMS records
        if tag.lower().startswith("fermilab") and \
           bfo.field("710__g").lower() in \
           ('atlas collaboration', 'cms collaboration'):
            continue
        if tag.upper() == "DURHAM":
            continue

        label = entry.get('y', 'Fulltext')
        on_site = target.startswith(CFG_SITE_URL)
        if target and label.upper() != "DOI" and not on_site:
            # external, non-DOI URL
            wanted = True
        else:
            # local file: only PDFs attached with an allowed doctype
            lowered = target.lower()
            wanted = on_site and \
                (lowered.endswith(".pdf") or
                 lowered.endswith('.pdf?subformat=pdfa')) and \
                bibdocfile_url_to_bibdoc(entry.get('u')).doctype in \
                allowed_doctypes
        if wanted:
            links.append(
                '<a %s href="%s">%s</a>' %
                (css, target, _lookup_url_name(bfo, label)))

    # Assemble the final output.
    if not links:
        return default
    if show_icons.lower() == 'yes':
        # NB: the continuation line's leading blanks are part of the literal
        img = '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
            % (CFG_BASE_URL, _("Download fulltext"))
        links = [img + '<small>' + link + '</small>' for link in links]
    return prefix + separator.join(links) + suffix
    def get_words_from_fulltext(self, url_direct_or_indirect):
        """Returns all the words contained in the document specified by
           URL_DIRECT_OR_INDIRECT with the words being split by various
           SRE_SEPARATORS regexp set earlier.  If FORCE_FILE_EXTENSION is
           set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
           file.  (This is interesting to index Indico for example.)  Note
           also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
           the fulltext file or an URL to a setlink-like page body that
           presents the links to be indexed.  In the latter case the
           URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
           to fulltext documents, for all knows file extensions as
           specified by global CONV_PROGRAMS config variable.
        """
        write_message("... reading fulltext files from %s started" % url_direct_or_indirect, verbose=2)
        try:
            if bibdocfile_url_p(url_direct_or_indirect):
                write_message("... %s is an internal document" % url_direct_or_indirect, verbose=2)
                try:
                    bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
                except InvenioBibDocFileError:
                    # Outdated 8564 tag
                    return []
                indexer = get_idx_indexer('fulltext')
                if indexer != 'native':
                    # A document might belong to multiple records
                    for rec_link in bibdoc.bibrec_links:
                        recid = rec_link["recid"]
                        # Adds fulltexts of all files once per records
                        if not recid in fulltext_added:
                            bibrecdocs = BibRecDocs(recid)
                            try:
                                text = bibrecdocs.get_text()
                            except InvenioBibDocFileError:
                                # Invalid PDF
                                continue
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(recid, text)
                            elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                xapian_add(recid, 'fulltext', text)

                        fulltext_added.add(recid)
                    # we are relying on an external information retrieval system
                    # to provide full-text indexing, so dispatch text to it and
                    # return nothing here:
                    return []
                else:
                    text = ""
                    if hasattr(bibdoc, "get_text"):
                        text = bibdoc.get_text()
                    return self.tokenize_for_words_default(text)
            else:
                if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                    write_message("... %s is external URL but indexing only local files" % url_direct_or_indirect, verbose=2)
                    return []
                write_message("... %s is an external URL" % url_direct_or_indirect, verbose=2)
                urls_to_index = set()
                for splash_re, url_re in CFG_BIBINDEX_SPLASH_PAGES.iteritems():
                    if re.match(splash_re, url_direct_or_indirect):
                        write_message("... %s is a splash page (%s)" % (url_direct_or_indirect, splash_re), verbose=2)
                        html = urllib2.urlopen(url_direct_or_indirect).read()
                        urls = get_links_in_html_page(html)
                        write_message("... found these URLs in %s splash page: %s" % (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                        for url in urls:
                            if re.match(url_re, url):
                                write_message("... will index %s (matched by %s)" % (url, url_re), verbose=2)
                                urls_to_index.add(url)
                if not urls_to_index:
                    urls_to_index.add(url_direct_or_indirect)
                write_message("... will extract words from %s" % ', '.join(urls_to_index), verbose=2)
                words = {}
                for url in urls_to_index:
                    tmpdoc = download_url(url)
                    file_converter_logger = get_file_converter_logger()
                    old_logging_level = file_converter_logger.getEffectiveLevel()
                    if self.verbose > 3:
                        file_converter_logger.setLevel(logging.DEBUG)
                    try:
                        try:
                            tmptext = convert_file(tmpdoc, output_format='.txt')
                            text = open(tmptext).read()
                            os.remove(tmptext)

                            indexer = get_idx_indexer('fulltext')
                            if indexer != 'native':
                                if indexer == 'SOLR' and CFG_SOLR_URL:
                                    solr_add_fulltext(None, text) # FIXME: use real record ID
                                if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                    #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                    pass
                                # we are relying on an external information retrieval system
                                # to provide full-text indexing, so dispatch text to it and
                                # return nothing here:
                                tmpwords = []
                            else:
                                tmpwords = self.tokenize_for_words_default(text)
                            words.update(dict(map(lambda x: (x, 1), tmpwords)))
                        except Exception, e:
                            message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                            register_exception(prefix=message, alert_admin=True)
                            write_message(message, stream=sys.stderr)
                    finally:
                        os.remove(tmpdoc)
                        if self.verbose > 3:
                            file_converter_logger.setLevel(old_logging_level)
                return words.keys()
        except Exception, e:
            message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
            register_exception(prefix=message, alert_admin=True)
            write_message(message, stream=sys.stderr)
            return []
Exemplo n.º 8
0
def format_element(bfo, default="", separator="; ", style="", show_icons="no", prefix="", suffix=""):
    """ Build the HTML links to external resources for one record.

    Links are collected in this order: services named in 035__ (KEK scans,
    CDS, ADS, INIS, HAL), a fallback ADS link built from arXiv numbers in
    037__, the mathematics/OSTI services named in 035__, and finally the
    explicit URLs stored in 8564_.

    @param default returned unchanged when no link could be built
    @param separator (separates instances of links)
    @param prefix put in front of the joined links
    @param suffix put after the joined links
    @param show_icons default = no; 'yes' (any case) adds an icon per link
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        style = ' class="' + style + '"'

    links = []
    has_ads_link = False

    # ADS/CDS/KEKSCAN/INIS/HAL links
    # 035__9 names the service, 035__a carries the identifier at that service
    for field in bfo.fields("035__"):
        source = field.get("9")
        ext_id = field.get("a")
        if source is None or ext_id is None:
            continue

        if source == "KEKSCAN":
            scan_id = ext_id.replace("-", "")
            if len(scan_id) == 7 and not scan_id.startswith(("19", "20")):
                # two-digit year form: expand it to the 1900s
                year = "19" + scan_id[:2]
            elif len(scan_id) == 9:
                # four-digit year form: keep the year, drop the century
                # from the identifier itself
                year = scan_id[:4]
                scan_id = scan_id[2:]
            else:
                # likely bad id
                continue
            links.append(
                '<a%s href="%s/%s/%s/%s.pdf"> KEK scanned document</a>'
                % (style, KEK, year, scan_id[:4], scan_id)
            )
            continue

        if source == "CDS":
            links.append('<a%s href="%s%s"> CERN Document Server</a>' % (style, CDS, ext_id))
            continue

        if source == "ADS":
            escaped_id = ext_id.replace("&", "%26")  # A&A etc.
            links.append('<a%s href="%s%s"> ADS Abstract Service</a>' % (style, ADSABS, escaped_id))
            has_ads_link = True
            continue

        if source == "INIS":
            links.append('<a%s href="%s%s"> INIS Repository</a>' % (style, INIS, ext_id))
            continue

        if source == "HAL":
            from invenio.webuser import isUserAdmin

            # HAL links are only shown to administrators
            if isUserAdmin(bfo.user_info):
                links.append('<a %s href="%s%s"> HAL Archives Ouvertes</a>' % (style, HAL, ext_id))

    # fallback ADS link via arXiv:e-print
    if not has_ads_link:
        # a set, so that a repeated e-print number yields a single link
        arxiv_numbers = {
            field.get("a", "")
            for field in bfo.fields("037__")
            if field.get("9", "") == "arXiv" and field.get("a") is not None
        }
        for number in arxiv_numbers:
            links.append('<a href="%s%s"> ADS Abstract Service</a>' % (ADSABS, number))

    # second pass over 035__: services matched case-insensitively
    for field in bfo.fields("035__"):
        source = field.get("9")
        ext_id = field.get("a")
        if source is None or ext_id is None:
            continue

        service = source.lower()
        if service == "msnet":
            links.append('<a%s href="%s%s"> AMS MathSciNet</a>' % (style, MSNET, ext_id))
        elif service == "zblatt":
            links.append('<a%s href="%s%s"> zbMATH</a>' % (style, ZBLATT, ext_id))
        elif service == "euclid":
            links.append('<a%s href="%s%s"> Project Euclid</a>' % (style, EUCLID, ext_id))
        elif service == "osti":
            links.append('<a%s href="%s%s"> OSTI Information Bridge Server</a>' % (style, OSTI, ext_id))

    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    allowed_doctypes = ["INSPIRE-PUBLIC", "SCOAP3", "PoS"]
    for field in bfo.fields("8564_"):
        target = field.get("u", "")
        tag = field.get("y", "")
        tag_lower = tag.lower()

        # these services are already linked through their 035__ identifiers
        if tag_lower in ("adsabs", "euclid", "msnet", "osti", "zblatt"):
            continue
        if ".png" in target:
            continue
        # Fermilab-labelled URLs are dropped for ATLAS and CMS records
        if tag_lower.startswith("fermilab") and bfo.field("710__g").lower() in (
            "atlas collaboration",
            "cms collaboration",
        ):
            continue
        if tag.upper() == "DURHAM":
            continue

        if target.startswith(CFG_SITE_URL):
            # self-link: only PDFs attached with an allowed doctype are shown
            wanted = (
                target.lower().endswith((".pdf", ".pdf?subformat=pdfa"))
                and bibdocfile_url_to_bibdoc(target).doctype in allowed_doctypes
            )
        else:
            # external link: anything non-empty that is not labelled as a DOI
            wanted = bool(target) and tag.upper() != "DOI"

        if wanted:
            links.append(
                '<a %s href="%s">%s</a>'
                % (style, target, _lookup_url_name(bfo, field.get("y", "Fulltext")))
            )

    # put it all together
    if not links:
        return default

    if show_icons.lower() == "yes":
        # the backslash continuation sits inside the literal, so the leading
        # whitespace of the next line is part of the emitted markup
        icon = (
            '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>'
            % (CFG_BASE_URL, _("Download fulltext"))
        )
        links = [icon + "<small>" + link + "</small>" for link in links]
    return prefix + separator.join(links) + suffix
Exemplo n.º 9
0
def format_element(bfo, default='', separator='; ', style='',
                   show_icons='no', prefix='', suffix=''):
    """ Creates html of links based on metadata

    Links are emitted in this order: DOI links built from the 773 and 0247_
    fields, KEK scanned documents and CERN Document Server records named in
    035__, then the explicit URLs stored in 8564_.

    @param default returned unchanged when no link could be built
    @param separator (separates instances of links)
    @param prefix put in front of the joined links
    @param suffix put after the joined links
    @param show_icons default = no; 'yes' (any case) adds an icon per link
    @param style options CSS style for link
    @return prefix + links joined by separator + suffix, or default
    """
    _ = gettext_set_language(bfo.lang)

    # Text inserted between '<a ' and 'href'.  It ends with a space so that
    # the class and href attributes are separated by whitespace.
    if style != "":
        style_attr = 'class = "' + style + '" '
    else:
        style_attr = ''

    links = []

    # trivially take care of dois
    for journal in bfo.fields('773') + bfo.fields('0247_'):
        doi = journal.get('a')
        if not doi:
            continue
        oa_type = bfo.kb('OALINKS', journal.get('n'), '').lower()
        if oa_type:
            # Put the caller's class and the open-access class in a single
            # attribute; a repeated class attribute would be ignored.
            css_classes = ' '.join(name for name in (style, oa_type) if name)
            anchor_attr = 'class = "' + css_classes + '" '
        else:
            anchor_attr = style_attr
        link = '<a ' + anchor_attr + 'href="http://dx.doi.org/' \
               + doi + '">Journal Server</a>'
        if journal.get('p'):
            link += ' - ' + journal.get('p')
        links.append(link)

    # KEKSCAN and CDS links
    # 035__9 names the service, 035__a carries the identifier at that service.
    # Collected separately so that all KEK links still precede all CDS links.
    kek_links = []
    cds_links = []
    for ident in bfo.fields('035__'):
        provenance = ident.get('9')
        extid = ident.get('a')
        if provenance is None or extid is None:
            # incomplete field: skip it instead of failing on a missing key
            continue
        if provenance == 'KEKSCAN':
            kek_links.append(
                '<a href="http://www-lib.kek.jp/cgi-bin/img_index?' +
                extid.replace("-", "") + '"> KEK scanned document </a>')
        elif provenance == 'CDS':
            cds_links.append('<a href="http://cds.cern.ch/record/' + extid +
                             '"> CERN Document Server </a>')
    links.extend(kek_links)
    links.extend(cds_links)

    # could look for other publication info and calculate URls here

    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in bfo.fields('8564_'):
        target = url.get('u')
        if not target:
            # no URL subfield: nothing to link to
            continue
        label = url.get('y', 'Fulltext')
        if target.startswith(CFG_SITE_URL):
            # self-link: shown only for PDFs attached with an allowed doctype.
            # The cheap suffix test runs first so that the document lookup is
            # only done for PDF URLs.
            wanted = target[-3:].lower() == "pdf" and \
                bibdocfile_url_to_bibdoc(target).doctype in allowed_doctypes
        else:
            # external link: shown unless it is labelled as a DOI
            wanted = label.upper() != "DOI"
        if wanted:
            links.append('<a ' + style_attr + 'href="' + target + '">' +
                         _lookup_url_name(bfo, label) + '</a>')

    # put it all together
    if not links:
        return default

    if show_icons.lower() == 'yes':
        # the backslash continuation sits inside the literal, so the leading
        # whitespace of the next line is part of the emitted markup
        img = '<img style="border:none" \
            src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
            % (CFG_SITE_URL, _("Download fulltext"))
        links = [img + '<small>' + link + '</small>' for link in links]
    return prefix + separator.join(links) + suffix