def format_element(bfo, default='', separator='; ', style='', show_icons='no', prefix='', suffix=''):
    """
    Creates html of links based on metadata.

    Links are built from:
      - 035__ external identifiers (subfield $a) with a service label in $9:
        KEKSCAN, CDS, ADS, HAL (HAL only for admin users);
      - 037__ arXiv e-print ids as an ADS fallback when no ADS id exists;
      - 035__ $9 labels msnet / zblatt / euclid / osti;
      - explicit URLs in 8564_ (excluding DOIs, Durham, icons and self-links
        except whitelisted local PDFs).

    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        # Pre-render the CSS class as a ready-to-concatenate attribute,
        # e.g. ' class="foo"' (note the leading space).
        style = ' class="' + style + '"'
    links = []
    # ADS/CDS/KEKSCAN/HAL links
    # external identifiers in tag 035__a along with service label in 035__9
    identifiers = bfo.fields('035__')
    adslinked = False
    for ident in identifiers:
        provenance = ident.get('9', None)
        extid = ident.get('a', None)
        if provenance is None or extid is None:
            # incomplete 035 instance -- nothing to link
            continue
        if provenance == 'KEKSCAN':
            extid = extid.replace("-", "")
            links.append('<a%s href="%s%s"> KEK scanned document</a>'
                         % (style, KEK, extid))
        elif provenance == 'CDS':
            links.append('<a%s href="%s%s"> CERN Document Server</a>'
                         % (style, CDS, extid))
        elif provenance == 'ADS':
            extid = extid.replace('&', '%26')  # A&A etc.
            links.append('<a%s href="%s%s"> ADS Abstract Service</a>'
                         % (style, ADSABS, extid))
            adslinked = True
        elif provenance == 'HAL':
            # HAL links are only shown to admin users.
            from invenio.webuser import isUserAdmin
            if isUserAdmin(bfo.user_info):
                links.append('<a %s href="%s%s"> HAL Archives Ouvertes</a>'
                             % (style, HAL, extid))
    # fallback ADS link via arXiv:e-print
    if not adslinked:
        identifiers = bfo.fields('037__')
        eprints = set()  # avoid duplicate links
        for ident in identifiers:
            if ident.get('9', '') == 'arXiv' \
                    and ident.get('a', None) is not None:
                eprints.add(ident.get('a', ''))
        if eprints:
            adslinked = True
            for eprint in eprints:
                links.append('<a href="%s%s"> ADS Abstract Service</a>'
                             % (ADSABS, eprint))
    # external identifiers in tag 035__a along with service label in 035__9
    urls = bfo.fields('035__')
    for url in urls:
        provenance = url.get('9', None)
        extid = url.get('a', None)
        if provenance is None or extid is None:
            continue
        provenance = provenance.lower()
        if provenance == "msnet":
            links.append('<a%s href="%s%s"> AMS MathSciNet</a>'
                         % (style, MSNET, extid))
        elif provenance == "zblatt":
            links.append('<a%s href="%s%s"> zbMATH</a>'
                         % (style, ZBLATT, extid))
        elif provenance == "euclid":
            links.append('<a%s href="%s%s"> Project Euclid</a>'
                         % (style, EUCLID, extid))
        elif provenance == "osti":
            links.append('<a%s href="%s%s"> OSTI Information Bridge Server</a>'
                         % (style, OSTI, extid))
    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields('8564_')
    allowed_doctypes = ["INSPIRE-PUBLIC", "SCOAP3", "PoS"]
    for url in urls:
        # skip services already linked via 035 above
        if url.get("y", "").lower() not in \
                ("adsabs", "euclid", "msnet", "osti", "zblatt"):
            # skip icon files and Fermilab links for large collaborations
            if '.png' not in url.get('u', '') and not (
                    url.get('y', '').lower().startswith("fermilab") and
                    bfo.field("710__g").lower() in
                    ('atlas collaboration', 'cms collaboration')):
                if url.get('y', '').upper() != "DURHAM":
                    if url.get("u", '') and \
                            url.get('y', 'Fulltext').upper() != "DOI" and not \
                            url.get('u', '').startswith(CFG_SITE_URL):
                        # external fulltext link labelled by $y
                        links.append('<a %s href="%s">%s</a>'
                                     % (style, url.get("u", ''),
                                        _lookup_url_name(bfo, url.get(
                                            'y', 'Fulltext'))))
                    elif url.get("u", '').startswith(CFG_SITE_URL) and \
                            (url.get("u", '').lower().endswith(".pdf") or
                             url.get("u", '').lower().endswith(
                                 '.pdf?subformat=pdfa')) and \
                            bibdocfile_url_to_bibdoc(url.get('u')).doctype in \
                            allowed_doctypes:
                        # self-link allowed only for whitelisted doctypes / PDFs
                        links.append('<a %s href="%s">%s</a>'
                                     % (style, url.get("u", ''),
                                        _lookup_url_name(bfo, url.get(
                                            'y', 'Fulltext'))))
    # put it all together
    if links:
        if show_icons.lower() == 'yes':
            img = '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
                % (CFG_BASE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
def format_element(bfo, default='', separator='; ', style='', \
                   show_icons='no', prefix='', suffix=''):
    """
    Creates html of links based on metadata.

    Builds links from 035__ (KEKSCAN/CDS, then msnet/zblatt/euclid),
    037__ arXiv ids (ADS, unless an ADSABS link already exists in 8564_y)
    and explicit 8564_ URLs.

    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        # NOTE(review): rendered as 'class = "..."' and concatenated directly
        # before 'href' further below without a separating space, producing
        # e.g. '<a class = "x"href="..."' -- confirm and fix spacing.
        style = 'class = "' + style + '"'
    links = []
    # KEKSCAN/CDS links
    identifiers = bfo.fields('035__')
    for ident in identifiers:
        if ident.get('9', '') == 'KEKSCAN' and ident.get('a', None) is not None:
            # KEK ids are linked with dashes stripped
            out = ident['a'].replace("-", "")
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?'
                         + out + '"> KEK scanned document </a>')
        if ident.get('9', '') == 'CDS' and ident.get('a', None) is not None:
            links.append('<a href="http://cds.cern.ch/record/'
                         + ident['a'] + '"> CERN Document Server </a>')
    # ADS links
    identifiers = bfo.fields('037__')
    current_links = bfo.field('8564_y')
    for ident in identifiers:
        # only add an ADS link when none is already present in 8564_y
        if ident.get('9', '') == 'arXiv' and not ("ADSABS" in current_links) \
                and ident.get('a', None) is not None:
            links.append('<a href="http://adsabs.harvard.edu/cgi-bin/basic_connect?qsearch='
                         + ident.get('a', '') + '">ADS Abstract Service</a>')
    #links moved to new field 035
    urls = bfo.fields('035__')
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in urls:
        # require both $9 (service) and $a (identifier)
        if "9" in url and "a" in url:
            if url["9"].lower() == "msnet":
                links.append('<a ' + style
                             + ' href="http://www.ams.org/mathscinet-getitem?mr='
                             + url["a"] + '">AMS MathSciNet</a>')
            if url["9"].lower() == "zblatt":
                links.append('<a ' + style
                             + ' href="http://www.zentralblatt-math.org/zmath/en/search/?an='
                             + url["a"] + '">zbMATH</a>')
            if url["9"].lower() == "euclid":
                links.append('<a ' + style
                             + ' href="http://projecteuclid.org/euclid.cmp/='
                             + url["a"] + '">Project Euclid</a>')
    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields('8564_')
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in urls:
        if url.get("y", "").lower() not in ("msnet", "zblatt", "euclid"):
            # NOTE(review): url['u'] raises KeyError when $u is absent -- the
            # other branches use url.get(); confirm whether $u is guaranteed.
            if '.png' not in url['u'] and not \
                    (url.get('y', '').lower().startswith("fermilab") and
                     bfo.field("710__g").lower() in
                     ('atlas collaboration', 'cms collaboration')):
                if url.get('y', '').upper() != "DURHAM":
                    if url.get("u") and \
                            url.get('y', 'Fulltext').upper() != "DOI" and not \
                            url.get('u').startswith(CFG_SITE_URL):
                        links.append('<a ' + style + \
                                     'href="' + url.get("u") + '">' + \
                                     _lookup_url_name(bfo, url.get('y', 'Fulltext')) + '</a>')
                    elif url.get("u").startswith(CFG_SITE_URL) and \
                            url.get("u")[-3:].lower() == "pdf" and \
                            bibdocfile_url_to_bibdoc(url.get('u')).doctype in allowed_doctypes:
                        # local PDF self-link, whitelisted doctypes only
                        links.append('<a ' + style + 'href="' + url.get("u") + '">' + \
                                     _lookup_url_name(bfo, url.get('y', 'Fulltext')) + '</a>')
    #put it all together
    if links:
        if show_icons.lower() == 'yes':
            img = '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
                % (CFG_BASE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
def format_element(bfo, default='', separator='; ', style='', \
                   show_icons='no', prefix='', suffix=''):
    """
    Creates html of links based on metadata.

    Builds links from 035__ identifiers (KEKSCAN/CDS, then
    msnet/zblatt/euclid), 037__ arXiv ids (ADS, unless an ADSABS link is
    already present in 8564_y) and explicit 8564_ URLs (excluding DOIs,
    Durham, icons and self-links except whitelisted local PDFs).

    @param default value returned when no link could be built
    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        # Pre-render as a ready-to-concatenate attribute, e.g. ' class="foo"'
        # (leading space included, so '<a' + style is always well-formed).
        style = ' class="' + style + '"'
    links = []
    # KEKSCAN links.
    # Use .get(): 035__ instances may lack $9 or $a; the original
    # ident['9'] raised KeyError on such records.
    identifiers = bfo.fields('035__')
    for ident in identifiers:
        if ident.get('9', '') == 'KEKSCAN' and ident.get('a') is not None:
            # KEK ids are linked with dashes stripped
            out = ident['a'].replace("-", "")
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?'
                         + out + '"> KEK scanned document </a>')
    # CDS links
    for ident in identifiers:
        if ident.get('9', '') == 'CDS' and ident.get('a') is not None:
            links.append('<a href="http://cds.cern.ch/record/'
                         + ident['a'] + '"> CERN Document Server </a>')
    # ADS links (only when no explicit ADSABS link already exists in 8564_y)
    identifiers = bfo.fields('037__')
    current_links = bfo.field('8564_y')
    for ident in identifiers:
        if ident.get('9', '') == 'arXiv' and "ADSABS" not in current_links \
                and ident.get('a') is not None:
            links.append('<a href="http://adsabs.harvard.edu/cgi-bin/basic_connect?qsearch='
                         + ident['a'] + '">ADS Abstract Service</a>')
    # could look for other publication info and calculate URls here

    #links moved to new field 035
    urls = bfo.fields('035__')
    for url in urls:
        # Require both subfields: $9 names the service, $a holds the id
        # (the original indexed url["a"] with only $9 checked -> KeyError).
        if "9" in url and "a" in url:
            if url["9"].lower() == "msnet":
                links.append('<a' + style
                             + ' href="http://www.ams.org/mathscinet-getitem?mr='
                             + url["a"] + '">AMS MathSciNet</a>')
            if url["9"].lower() == "zblatt":
                links.append('<a' + style
                             + ' href="http://www.zentralblatt-math.org/zmath/en/search/?an='
                             + url["a"] + '">zbMATH</a>')
            if url["9"].lower() == "euclid":
                links.append('<a' + style
                             + ' href="http://projecteuclid.org/euclid.cmp/='
                             + url["a"] + '">Project Euclid</a>')

    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields('8564_')
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in urls:
        # $u may be absent: the original url['u'] raised KeyError and
        # url.get('u').startswith(...) raised AttributeError on None.
        u = url.get("u", '')
        if url.get("y", "").lower() not in ("msnet", "zblatt", "euclid"):
            # skip icon files and Fermilab links for large collaborations
            if '.png' not in u and not \
                    (url.get('y', '').lower().startswith("fermilab") and
                     bfo.field("710__g").lower() in
                     ('atlas collaboration', 'cms collaboration')):
                if url.get('y', '').upper() != "DURHAM":
                    if u and url.get('y', 'Fulltext').upper() != "DOI" \
                            and not u.startswith(CFG_SITE_URL):
                        # External fulltext link.  Note the space between
                        # the class attribute and href (the original ran
                        # 'class = "..."' straight into 'href').
                        links.append('<a' + style + ' href="' + u + '">'
                                     + _lookup_url_name(bfo, url.get('y', 'Fulltext'))
                                     + '</a>')
                    elif u.startswith(CFG_SITE_URL) and \
                            u[-3:].lower() == "pdf" and \
                            bibdocfile_url_to_bibdoc(u).doctype in allowed_doctypes:
                        # Self-link allowed only for whitelisted doctypes / PDFs.
                        links.append('<a' + style + ' href="' + u + '">'
                                     + _lookup_url_name(bfo, url.get('y', 'Fulltext'))
                                     + '</a>')
    #put it all together
    if links:
        if show_icons.lower() == 'yes':
            img = '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
                % (CFG_BASE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Returns all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT with the words being split by various
    SRE_SEPARATORS regexp set earlier.  If FORCE_FILE_EXTENSION is
    set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
    file.  (This is interesting to index Indico for example.)  Note
    also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
    the fulltext file or an URL to a setlink-like page body that
    presents the links to be indexed.  In the latter case the
    URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all knows file extensions as
    specified by global CONV_PROGRAMS config variable.

    When a non-native indexer (SOLR/Xapian) is configured, the text is
    dispatched to that service instead and an empty list is returned.
    NOTE: Python 2 code (urllib2, dict.iteritems, 'except E, e' syntax).
    """
    write_message("... reading fulltext files from %s started" %
                  url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            # --- internal document: fetch text via the BibDoc machinery ---
            write_message("... %s is an internal document" %
                          url_direct_or_indirect, verbose=2)
            try:
                bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            except InvenioBibDocFileError:
                # Outdated 8564 tag
                return []
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per records
                    # NOTE(review): 'fulltext_added' is not defined in this
                    # method -- presumably a module-level set; confirm.
                    if not recid in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        try:
                            text = bibrecdocs.get_text()
                        except InvenioBibDocFileError:
                            # Invalid PDF
                            continue
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            # --- external URL: possibly a splash page with fulltext links ---
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" %
                              url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" %
                          url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            # splash_re matches the splash page URL; url_re selects which
            # links found on that page should be indexed
            for splash_re, url_re in CFG_BIBINDEX_SPLASH_PAGES.iteritems():
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" %
                                  (url_direct_or_indirect, splash_re), verbose=2)
                    html = urllib2.urlopen(url_direct_or_indirect).read()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" %
                                  (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" %
                                          (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # not a splash page: index the URL itself
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" %
                          ', '.join(urls_to_index), verbose=2)
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        text = open(tmptext).read()
                        os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text)  # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        # words acts as a set: value 1 is a dummy
                        words.update(dict(map(lambda x: (x, 1), tmpwords)))
                    except Exception, e:
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    # always clean up the downloaded temporary file
                    os.remove(tmpdoc)
            if self.verbose > 3:
                # NOTE(review): restores the level saved in the *last* loop
                # iteration only -- confirm this is intended.
                file_converter_logger.setLevel(old_logging_level)
            return words.keys()
    except Exception, e:
        # top-level guard: indexing failures are reported but never fatal
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []
def format_element(bfo, default="", separator="; ", style="", show_icons="no", prefix="", suffix=""):
    """
    Creates html of links based on metadata.

    Builds links from the arXiv format element, journal DOIs in 773__a
    (with an optional open-access CSS class from the OALINKS knowledge
    base), KEKSCAN/CDS identifiers in 035__, and explicit 8564_ URLs
    (excluding DOIs and self-links except whitelisted local PDFs).

    @param default value returned when no link could be built
    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    links = []

    def _class_attr(extra=None):
        # Render a single well-formed class attribute (leading space
        # included), merging the caller's style with an optional extra
        # class.  The original emitted two separate 'class' attributes and
        # ran the style straight into 'href' without a space.
        names = []
        if style:
            names.append(style)
        if extra:
            names.append(extra)
        if names:
            return ' class="' + " ".join(names) + '"'
        return ""

    # arXiv links are delegated to the dedicated arXiv format element.
    from invenio.bibformat_elements.bfe_INSPIRE_arxiv import format_element as arxiv
    arxiv_links = arxiv(bfo, links="yes", mirrors="no")
    if arxiv_links:
        links.append(arxiv_links)

    journals = bfo.fields("773")
    # trivially take care of dois
    for journal in journals:
        journtitle = ""
        # OALINKS maps the journal code ($n) to an open-access CSS class.
        oa_type = bfo.kb("OALINKS", journal.get("n"), "").lower()
        if journal.get("a"):
            if journal.get("p"):
                journtitle = " - " + journal.get("p")
            links.append(
                "<a" + _class_attr(oa_type or None)
                + ' href="http://dx.doi.org/' + journal.get("a")
                + '">Journal Server</a>' + journtitle
            )

    # KEKSCAN links.
    # Use .get(): 035__ instances may lack $9 or $a (the original
    # ident["9"] raised KeyError on such records).
    identifiers = bfo.fields("035__")
    for ident in identifiers:
        if ident.get("9", "") == "KEKSCAN" and ident.get("a") is not None:
            # KEK ids are linked with dashes stripped
            out = ident["a"].replace("-", "")
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?' + out + '"> KEK scanned document </a>')

    # CDS links
    for ident in identifiers:
        if ident.get("9", "") == "CDS" and ident.get("a") is not None:
            links.append('<a href="http://cds.cern.ch/record/' + ident["a"] + '"> CERN Document Server </a>')

    # could look for other publication info and calculate URls here
    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields("8564_")
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in urls:
        # $u may be absent: the original url.get("u").startswith(...)
        # raised AttributeError on None in the elif branch.
        u = url.get("u", "")
        if u and url.get("y", "Fulltext").upper() != "DOI" and not u.startswith(CFG_SITE_URL):
            # external fulltext link labelled by $y
            links.append(
                "<a" + _class_attr() + ' href="' + u + '">'
                + _lookup_url_name(bfo, url.get("y", "Fulltext")) + "</a>"
            )
        elif u.startswith(CFG_SITE_URL) and \
                u[-3:].lower() == "pdf" and \
                bibdocfile_url_to_bibdoc(u).doctype in allowed_doctypes:
            # self-link allowed only for whitelisted doctypes / PDFs
            links.append(
                "<a" + _class_attr() + ' href="' + u + '">'
                + _lookup_url_name(bfo, url.get("y", "Fulltext")) + "</a>"
            )

    # put it all together
    if links:
        if show_icons.lower() == "yes":
            img = (
                '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>'
                % (CFG_SITE_URL, _("Download fulltext"))
            )
            links = [img + "<small>" + link + "</small>" for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
def format_element(bfo, default='', separator='; ', style='', show_icons='no', prefix='', suffix=''):
    """
    Creates html of links based on metadata.

    Links are built from:
      - 035__ external identifiers ($a) with a service label in $9:
        KEKSCAN (rewritten into a year/yymm PDF path), CDS, ADS, INIS,
        HAL (admin users only);
      - 037__ arXiv e-print ids as an ADS fallback when no ADS id exists;
      - 035__ $9 labels msnet / zblatt / euclid / osti;
      - explicit 8564_ URLs (excluding DOIs, Durham, icons and self-links
        except whitelisted local PDFs).

    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        # Pre-render the CSS class as a ready-to-concatenate attribute.
        style = ' class="' + style + '"'
    links = []
    # ADS/CDS/KEKSCAN/INIS/HAL links
    # external identifiers in tag 035__a along with service label in 035__9
    identifiers = bfo.fields('035__')
    adslinked = False
    for ident in identifiers:
        provenance = ident.get('9', None)
        extid = ident.get('a', None)
        if provenance is None or extid is None:
            continue
        if provenance == 'KEKSCAN':
            extid = extid.replace("-", "")
            # Normalise the scanned-document id into a 4-digit year plus a
            # yymm directory component: 7-digit ids not starting with
            # 19/20 are treated as 19yy...; 9-digit ids carry the full
            # year in their first 4 digits.
            if len(extid) == 7 and not extid.startswith(
                    '19') and not extid.startswith('20'):
                year = '19' + extid[:2]
            elif len(extid) == 9:
                year = extid[:4]
                extid = extid[2:]
            else:
                # likely bad id
                continue
            yymm = extid[:4]
            links.append(
                '<a%s href="%s/%s/%s/%s.pdf"> KEK scanned document</a>'
                % (style, KEK, year, yymm, extid))
        elif provenance == 'CDS':
            links.append('<a%s href="%s%s"> CERN Document Server</a>'
                         % (style, CDS, extid))
        elif provenance == 'ADS':
            extid = extid.replace('&', '%26')  # A&A etc.
            links.append('<a%s href="%s%s"> ADS Abstract Service</a>'
                         % (style, ADSABS, extid))
            adslinked = True
        elif provenance == 'INIS':
            links.append('<a%s href="%s%s"> INIS Repository</a>'
                         % (style, INIS, extid))
        elif provenance == 'HAL':
            # HAL links are only shown to admin users.
            from invenio.webuser import isUserAdmin
            if isUserAdmin(bfo.user_info):
                links.append('<a %s href="%s%s"> HAL Archives Ouvertes</a>'
                             % (style, HAL, extid))
    # fallback ADS link via arXiv:e-print
    if not adslinked:
        identifiers = bfo.fields('037__')
        eprints = set()  # avoid duplicate links
        for ident in identifiers:
            if ident.get('9', '') == 'arXiv' \
                    and ident.get('a', None) is not None:
                eprints.add(ident.get('a', ''))
        if eprints:
            adslinked = True
            for eprint in eprints:
                links.append('<a href="%s%s"> ADS Abstract Service</a>'
                             % (ADSABS, eprint))
    # external identifiers in tag 035__a along with service label in 035__9
    urls = bfo.fields('035__')
    for url in urls:
        provenance = url.get('9', None)
        extid = url.get('a', None)
        if provenance is None or extid is None:
            continue
        provenance = provenance.lower()
        if provenance == "msnet":
            links.append('<a%s href="%s%s"> AMS MathSciNet</a>'
                         % (style, MSNET, extid))
        elif provenance == "zblatt":
            links.append('<a%s href="%s%s"> zbMATH</a>'
                         % (style, ZBLATT, extid))
        elif provenance == "euclid":
            links.append('<a%s href="%s%s"> Project Euclid</a>'
                         % (style, EUCLID, extid))
        elif provenance == "osti":
            links.append(
                '<a%s href="%s%s"> OSTI Information Bridge Server</a>'
                % (style, OSTI, extid))
    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields('8564_')
    allowed_doctypes = ["INSPIRE-PUBLIC", "SCOAP3", "PoS"]
    for url in urls:
        # skip services already linked via 035 above
        if url.get("y", "").lower() not in \
                ("adsabs", "euclid", "msnet", "osti", "zblatt"):
            # skip icon files and Fermilab links for large collaborations
            if '.png' not in url.get('u', '') and not (
                    url.get('y', '').lower().startswith("fermilab") and
                    bfo.field("710__g").lower() in
                    ('atlas collaboration', 'cms collaboration')):
                if url.get('y', '').upper() != "DURHAM":
                    if url.get("u", '') and \
                            url.get('y', 'Fulltext').upper() != "DOI" and not \
                            url.get('u', '').startswith(CFG_SITE_URL):
                        # external fulltext link labelled by $y
                        links.append(
                            '<a %s href="%s">%s</a>'
                            % (style, url.get("u", ''),
                               _lookup_url_name(bfo, url.get('y', 'Fulltext'))))
                    elif url.get("u", '').startswith(CFG_SITE_URL) and \
                            (url.get("u", '').lower().endswith(".pdf") or
                             url.get("u", '').lower().endswith(
                                 '.pdf?subformat=pdfa')) and \
                            bibdocfile_url_to_bibdoc(url.get('u')).doctype in \
                            allowed_doctypes:
                        # self-link allowed only for whitelisted doctypes / PDFs
                        links.append(
                            '<a %s href="%s">%s</a>'
                            % (style, url.get("u", ''),
                               _lookup_url_name(bfo, url.get('y', 'Fulltext'))))
    # put it all together
    if links:
        if show_icons.lower() == 'yes':
            img = '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
                % (CFG_BASE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
def get_words_from_fulltext(self, url_direct_or_indirect):
    """Returns all the words contained in the document specified by
    URL_DIRECT_OR_INDIRECT with the words being split by various
    SRE_SEPARATORS regexp set earlier.  If FORCE_FILE_EXTENSION is
    set (e.g. to "pdf", then treat URL_DIRECT_OR_INDIRECT as a PDF
    file.  (This is interesting to index Indico for example.)  Note
    also that URL_DIRECT_OR_INDIRECT may be either a direct URL to
    the fulltext file or an URL to a setlink-like page body that
    presents the links to be indexed.  In the latter case the
    URL_DIRECT_OR_INDIRECT is parsed to extract actual direct URLs
    to fulltext documents, for all knows file extensions as
    specified by global CONV_PROGRAMS config variable.

    When a non-native indexer (SOLR/Xapian) is configured, the text is
    dispatched to that service instead and an empty list is returned.
    NOTE: Python 2 code (urllib2, dict.iteritems, 'except E, e' syntax).
    """
    write_message("... reading fulltext files from %s started" %
                  url_direct_or_indirect, verbose=2)
    try:
        if bibdocfile_url_p(url_direct_or_indirect):
            # --- internal document: fetch text via the BibDoc machinery ---
            write_message("... %s is an internal document" %
                          url_direct_or_indirect, verbose=2)
            try:
                bibdoc = bibdocfile_url_to_bibdoc(url_direct_or_indirect)
            except InvenioBibDocFileError:
                # Outdated 8564 tag
                return []
            indexer = get_idx_indexer('fulltext')
            if indexer != 'native':
                # A document might belong to multiple records
                for rec_link in bibdoc.bibrec_links:
                    recid = rec_link["recid"]
                    # Adds fulltexts of all files once per records
                    # NOTE(review): 'fulltext_added' is not defined in this
                    # method -- presumably a module-level set; confirm.
                    if not recid in fulltext_added:
                        bibrecdocs = BibRecDocs(recid)
                        try:
                            text = bibrecdocs.get_text()
                        except InvenioBibDocFileError:
                            # Invalid PDF
                            continue
                        if indexer == 'SOLR' and CFG_SOLR_URL:
                            solr_add_fulltext(recid, text)
                        elif indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                            xapian_add(recid, 'fulltext', text)
                        fulltext_added.add(recid)
                # we are relying on an external information retrieval system
                # to provide full-text indexing, so dispatch text to it and
                # return nothing here:
                return []
            else:
                text = ""
                if hasattr(bibdoc, "get_text"):
                    text = bibdoc.get_text()
                return self.tokenize_for_words_default(text)
        else:
            # --- external URL: possibly a splash page with fulltext links ---
            if CFG_BIBINDEX_FULLTEXT_INDEX_LOCAL_FILES_ONLY:
                write_message("... %s is external URL but indexing only local files" %
                              url_direct_or_indirect, verbose=2)
                return []
            write_message("... %s is an external URL" %
                          url_direct_or_indirect, verbose=2)
            urls_to_index = set()
            # splash_re matches the splash page URL; url_re selects which
            # links found on that page should be indexed
            for splash_re, url_re in CFG_BIBINDEX_SPLASH_PAGES.iteritems():
                if re.match(splash_re, url_direct_or_indirect):
                    write_message("... %s is a splash page (%s)" %
                                  (url_direct_or_indirect, splash_re), verbose=2)
                    html = urllib2.urlopen(url_direct_or_indirect).read()
                    urls = get_links_in_html_page(html)
                    write_message("... found these URLs in %s splash page: %s" %
                                  (url_direct_or_indirect, ", ".join(urls)), verbose=3)
                    for url in urls:
                        if re.match(url_re, url):
                            write_message("... will index %s (matched by %s)" %
                                          (url, url_re), verbose=2)
                            urls_to_index.add(url)
            if not urls_to_index:
                # not a splash page: index the URL itself
                urls_to_index.add(url_direct_or_indirect)
            write_message("... will extract words from %s" %
                          ', '.join(urls_to_index), verbose=2)
            words = {}
            for url in urls_to_index:
                tmpdoc = download_url(url)
                file_converter_logger = get_file_converter_logger()
                old_logging_level = file_converter_logger.getEffectiveLevel()
                if self.verbose > 3:
                    file_converter_logger.setLevel(logging.DEBUG)
                try:
                    try:
                        tmptext = convert_file(tmpdoc, output_format='.txt')
                        text = open(tmptext).read()
                        os.remove(tmptext)
                        indexer = get_idx_indexer('fulltext')
                        if indexer != 'native':
                            if indexer == 'SOLR' and CFG_SOLR_URL:
                                solr_add_fulltext(None, text)  # FIXME: use real record ID
                            if indexer == 'XAPIAN' and CFG_XAPIAN_ENABLED:
                                #xapian_add(None, 'fulltext', text) # FIXME: use real record ID
                                pass
                            # we are relying on an external information retrieval system
                            # to provide full-text indexing, so dispatch text to it and
                            # return nothing here:
                            tmpwords = []
                        else:
                            tmpwords = self.tokenize_for_words_default(text)
                        # words acts as a set: value 1 is a dummy
                        words.update(dict(map(lambda x: (x, 1), tmpwords)))
                    except Exception, e:
                        message = 'ERROR: it\'s impossible to correctly extract words from %s referenced by %s: %s' % (url, url_direct_or_indirect, e)
                        register_exception(prefix=message, alert_admin=True)
                        write_message(message, stream=sys.stderr)
                finally:
                    # always clean up the downloaded temporary file
                    os.remove(tmpdoc)
            if self.verbose > 3:
                # NOTE(review): restores the level saved in the *last* loop
                # iteration only -- confirm this is intended.
                file_converter_logger.setLevel(old_logging_level)
            return words.keys()
    except Exception, e:
        # top-level guard: indexing failures are reported but never fatal
        message = 'ERROR: it\'s impossible to correctly extract words from %s: %s' % (url_direct_or_indirect, e)
        register_exception(prefix=message, alert_admin=True)
        write_message(message, stream=sys.stderr)
        return []
def format_element(bfo, default="", separator="; ", style="", show_icons="no", prefix="", suffix=""):
    """
    Creates html of links based on metadata.

    Links are built from:
      - 035__ external identifiers ($a) with a service label in $9:
        KEKSCAN (rewritten into a year/yymm PDF path), CDS, ADS, INIS,
        HAL (admin users only);
      - 037__ arXiv e-print ids as an ADS fallback when no ADS id exists;
      - 035__ $9 labels msnet / zblatt / euclid / osti;
      - explicit 8564_ URLs (excluding DOIs, Durham, icons and self-links
        except whitelisted local PDFs).

    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        # Pre-render the CSS class as a ready-to-concatenate attribute.
        style = ' class="' + style + '"'
    links = []
    # ADS/CDS/KEKSCAN/INIS/HAL links
    # external identifiers in tag 035__a along with service label in 035__9
    identifiers = bfo.fields("035__")
    adslinked = False
    for ident in identifiers:
        provenance = ident.get("9", None)
        extid = ident.get("a", None)
        if provenance is None or extid is None:
            continue
        if provenance == "KEKSCAN":
            extid = extid.replace("-", "")
            # Normalise the scanned-document id into a 4-digit year plus a
            # yymm directory component: 7-digit ids not starting with
            # 19/20 are treated as 19yy...; 9-digit ids carry the full
            # year in their first 4 digits.
            if len(extid) == 7 and not extid.startswith("19") and not extid.startswith("20"):
                year = "19" + extid[:2]
            elif len(extid) == 9:
                year = extid[:4]
                extid = extid[2:]
            else:
                # likely bad id
                continue
            yymm = extid[:4]
            links.append('<a%s href="%s/%s/%s/%s.pdf"> KEK scanned document</a>' % (style, KEK, year, yymm, extid))
        elif provenance == "CDS":
            links.append('<a%s href="%s%s"> CERN Document Server</a>' % (style, CDS, extid))
        elif provenance == "ADS":
            extid = extid.replace("&", "%26")  # A&A etc.
            links.append('<a%s href="%s%s"> ADS Abstract Service</a>' % (style, ADSABS, extid))
            adslinked = True
        elif provenance == "INIS":
            links.append('<a%s href="%s%s"> INIS Repository</a>' % (style, INIS, extid))
        elif provenance == "HAL":
            # HAL links are only shown to admin users.
            from invenio.webuser import isUserAdmin
            if isUserAdmin(bfo.user_info):
                links.append('<a %s href="%s%s"> HAL Archives Ouvertes</a>' % (style, HAL, extid))
    # fallback ADS link via arXiv:e-print
    if not adslinked:
        identifiers = bfo.fields("037__")
        eprints = set()  # avoid duplicate links
        for ident in identifiers:
            if ident.get("9", "") == "arXiv" and ident.get("a", None) is not None:
                eprints.add(ident.get("a", ""))
        if eprints:
            adslinked = True
            for eprint in eprints:
                links.append('<a href="%s%s"> ADS Abstract Service</a>' % (ADSABS, eprint))
    # external identifiers in tag 035__a along with service label in 035__9
    urls = bfo.fields("035__")
    for url in urls:
        provenance = url.get("9", None)
        extid = url.get("a", None)
        if provenance is None or extid is None:
            continue
        provenance = provenance.lower()
        if provenance == "msnet":
            links.append('<a%s href="%s%s"> AMS MathSciNet</a>' % (style, MSNET, extid))
        elif provenance == "zblatt":
            links.append('<a%s href="%s%s"> zbMATH</a>' % (style, ZBLATT, extid))
        elif provenance == "euclid":
            links.append('<a%s href="%s%s"> Project Euclid</a>' % (style, EUCLID, extid))
        elif provenance == "osti":
            links.append('<a%s href="%s%s"> OSTI Information Bridge Server</a>' % (style, OSTI, extid))
    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields("8564_")
    allowed_doctypes = ["INSPIRE-PUBLIC", "SCOAP3", "PoS"]
    for url in urls:
        # skip services already linked via 035 above
        if url.get("y", "").lower() not in ("adsabs", "euclid", "msnet", "osti", "zblatt"):
            # skip icon files and Fermilab links for large collaborations
            if ".png" not in url.get("u", "") and not (
                url.get("y", "").lower().startswith("fermilab")
                and bfo.field("710__g").lower() in ("atlas collaboration", "cms collaboration")
            ):
                if url.get("y", "").upper() != "DURHAM":
                    if (
                        url.get("u", "")
                        and url.get("y", "Fulltext").upper() != "DOI"
                        and not url.get("u", "").startswith(CFG_SITE_URL)
                    ):
                        # external fulltext link labelled by $y
                        links.append(
                            '<a %s href="%s">%s</a>'
                            % (style, url.get("u", ""), _lookup_url_name(bfo, url.get("y", "Fulltext")))
                        )
                    elif (
                        url.get("u", "").startswith(CFG_SITE_URL)
                        and (
                            url.get("u", "").lower().endswith(".pdf")
                            or url.get("u", "").lower().endswith(".pdf?subformat=pdfa")
                        )
                        and bibdocfile_url_to_bibdoc(url.get("u")).doctype in allowed_doctypes
                    ):
                        # self-link allowed only for whitelisted doctypes / PDFs
                        links.append(
                            '<a %s href="%s">%s</a>'
                            % (style, url.get("u", ""), _lookup_url_name(bfo, url.get("y", "Fulltext")))
                        )
    # put it all together
    if links:
        if show_icons.lower() == "yes":
            img = (
                '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>'
                % (CFG_BASE_URL, _("Download fulltext"))
            )
            links = [img + "<small>" + link + "</small>" for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default
def format_element(bfo, default = '', separator = '; ', style = '', \
                   show_icons = 'no', prefix='', suffix=''):
    """
    Creates html of links based on metadata.

    Builds links from journal DOIs in 773__a / 0247_ (with an optional
    open-access CSS class from the OALINKS knowledge base), KEKSCAN/CDS
    identifiers in 035__, and explicit 8564_ URLs.

    @param separator (separates instances of links)
    @param prefix
    @param suffix
    @param show_icons default = no
    @param style options CSS style for link
    """
    _ = gettext_set_language(bfo.lang)
    if style != "":
        # NOTE(review): rendered as 'class = "..."' and concatenated
        # directly before 'href' below without a separating space,
        # producing e.g. '<a class = "x"href="..."' -- confirm and fix.
        style = 'class = "' + style + '"'
    links = []
    journals = bfo.fields('773')
    journal_doi = bfo.fields('0247_')
    # trivially take care of dois
    for journal in journals + journal_doi:
        journtitle = ''
        # OALINKS maps the journal code ($n) to an open-access CSS class.
        # NOTE(review): when both style and oa_type are set this emits two
        # 'class' attributes on the same <a> -- confirm intended.
        oa_type = bfo.kb('OALINKS', journal.get('n'), '').lower()
        if oa_type:
            final_style = style + ' class = "' + oa_type + '"'
        else:
            final_style = style
        if journal.get('a'):
            if journal.get('p'):
                journtitle = ' - ' + journal.get('p')
            links.append('<a ' + final_style + 'href="http://dx.doi.org/'\
                         + journal.get('a') + '">Journal Server</a>' + journtitle)
    # KEKSCAN links
    # NOTE(review): ident['9'] raises KeyError when the 035 instance lacks
    # $9 -- other variants of this element use ident.get('9', ''); confirm.
    identifiers = bfo.fields('035__')
    for ident in identifiers:
        if ident['9'] == 'KEKSCAN':
            # KEK ids are linked with dashes stripped
            out = ident['a'].replace("-", "")
            links.append('<a href="http://www-lib.kek.jp/cgi-bin/img_index?'
                         + out + '"> KEK scanned document </a>')
    # CDS links
    identifiers = bfo.fields('035__')
    for ident in identifiers:
        if ident['9'] == 'CDS':
            links.append('<a href="http://cds.cern.ch/record/'
                         + ident['a'] + '"> CERN Document Server </a>')
    # could look for other publication info and calculate URls here
    # now look for explicit URLs
    # might want to check that we aren't repeating things from above...
    # Note: excluding self-links
    urls = bfo.fields('8564_')
    allowed_doctypes = ["INSPIRE-PUBLIC"]
    for url in urls:
        if url.get("u") and \
                url.get('y', 'Fulltext').upper() != "DOI" and not \
                url.get('u').startswith(CFG_SITE_URL):
            # external fulltext link labelled by $y
            links.append('<a ' + style + \
                         'href="' + url.get("u") + '">' + \
                         _lookup_url_name(bfo, url.get('y', 'Fulltext')) + '</a>')
        elif url.get("u").startswith(CFG_SITE_URL) and \
                bibdocfile_url_to_bibdoc(url.get('u')).doctype in allowed_doctypes and \
                url.get("u")[-3:].lower() == "pdf":
            # NOTE(review): url.get("u") may be None here (no 'u' subfield),
            # which would raise AttributeError on .startswith -- confirm.
            links.append('<a ' + style + 'href="' + url.get("u") + '">' + \
                         _lookup_url_name(bfo, url.get('y', 'Fulltext')) + '</a>')
    #put it all together
    if links:
        if show_icons.lower() == 'yes':
            img = '<img style="border:none" \
src="%s/img/file-icon-text-12x16.gif" alt="%s"/>' \
                % (CFG_SITE_URL, _("Download fulltext"))
            links = [img + '<small>' + link + '</small>' for link in links]
        return prefix + separator.join(links) + suffix
    else:
        return default