Exemplo n.º 1
0
def download(url, localFileName=None):
    """Download a file from a remote URL and save it to the local disk.

    The name of the saved file is chosen in this order of precedence:

    1. ``localFileName``, when given;
    2. the ``filename`` parameter of the ``Content-Disposition`` header;
    3. the final URL, when the request was redirected;
    4. the requested URL.

    :param url: remote URL to fetch
    :param localFileName: optional path under which the file is saved
    """
    import os

    localName = url2name(url)
    req = urllib2.Request(url)
    r = make_invenio_opener('BibEncode').open(req)
    try:
        headerName = None
        headers = r.info()
        if 'Content-Disposition' in headers:
            disposition = headers['Content-Disposition']
            # The header may come without any filename parameter at all.
            if 'filename=' in disposition:
                value = disposition.split('filename=', 1)[1].strip()
                if value[:1] in ('"', "'"):
                    # Quoted value: keep only what lies between the quotes.
                    closing = value.find(value[0], 1)
                    if closing == -1:
                        value = value[1:]
                    else:
                        value = value[1:closing]
                else:
                    # Unquoted value ends at the next parameter separator.
                    value = value.split(';', 1)[0].strip()
                # The header is controlled by the remote server: drop any
                # directory part so the file cannot land outside the
                # current directory.
                headerName = os.path.basename(value)
        if headerName:
            localName = headerName
        elif r.url != url:
            # We were redirected: take the file name from the final URL.
            localName = url2name(r.url)
        if localFileName:
            # The caller can force the name under which the file is saved.
            localName = localFileName
        # The context manager closes the file even when copying fails.
        with open(localName, 'wb') as f:
            shutil.copyfileobj(r, f)
    finally:
        r.close()
Exemplo n.º 2
0
def download(url, localFileName=None):
    """Download a file from a remote URL and save it to the local disk.

    The name of the saved file is chosen in this order of precedence:

    1. ``localFileName``, when given;
    2. the ``filename`` parameter of the ``Content-Disposition`` header;
    3. the final URL, when the request was redirected;
    4. the requested URL.

    :param url: remote URL to fetch
    :param localFileName: optional path under which the file is saved
    """
    import os

    localName = url2name(url)
    req = urllib2.Request(url)
    r = make_invenio_opener("BibEncode").open(req)
    try:
        headerName = None
        headers = r.info()
        if "Content-Disposition" in headers:
            disposition = headers["Content-Disposition"]
            # The header may come without any filename parameter at all.
            if "filename=" in disposition:
                value = disposition.split("filename=", 1)[1].strip()
                if value[:1] in ('"', "'"):
                    # Quoted value: keep only what lies between the quotes.
                    closing = value.find(value[0], 1)
                    if closing == -1:
                        value = value[1:]
                    else:
                        value = value[1:closing]
                else:
                    # Unquoted value ends at the next parameter separator.
                    value = value.split(";", 1)[0].strip()
                # The header is controlled by the remote server: drop any
                # directory part so the file cannot land outside the
                # current directory.
                headerName = os.path.basename(value)
        if headerName:
            localName = headerName
        elif r.url != url:
            # We were redirected: take the file name from the final URL.
            localName = url2name(r.url)
        if localFileName:
            # The caller can force the name under which the file is saved.
            localName = localFileName
        # The context manager closes the file even when copying fails.
        with open(localName, "wb") as f:
            shutil.copyfileobj(r, f)
    finally:
        r.close()
Exemplo n.º 3
0
"""

from __future__ import print_function

__revision__ = "$Id$"

import pprint
import sys
import re
import getopt
from invenio.legacy.search_engine import perform_request_search
from invenio.legacy.bibrecord import get_fieldvalues
from invenio.config import CFG_CERN_SITE
from invenio.utils.url import make_invenio_opener

# URL opener built once at import time for this module.
BIBFORMAT_OPENER = make_invenio_opener('BibFormat')

# Tag used for the journal name; CERN sites use a different one.
journal_name_tag = '773__p' if CFG_CERN_SITE else '909C4p'

issns = {
    'aapps bull.': '0218-2203',
    'account. manag. inf. technol.': '0959-8022',
    'acm comput. surv.': '0360-0300',
    'acm sigplan not.': '0362-1340',
    'acm trans. comput. syst.': '0734-2071',
    'acm trans. comput.-hum. interact.': '1073-0516',
    'acm trans. database syst.': '0362-5915',
    'acm trans. graph.': '0730-0301',
Exemplo n.º 4
0
from invenio.modules.classifier.errors import TaxonomyError

# Module-level logger, obtained through bconfig.get_logger.
log = bconfig.get_logger("bibclassify.ontology_reader")
from invenio import config

from invenio.modules.classifier.registry import taxonomies

# In standalone mode no database layer is available (dbquery is None) and
# urllib2's plain urlopen is used; otherwise dbquery comes from Invenio and
# urlopen is bound to the open method of an Invenio-built opener.
if bconfig.STANDALONE:
    dbquery = None
    from urllib2 import urlopen
else:
    from invenio.legacy import dbquery
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener('BibClassify').open

# Pre-compiled regular expressions. Raw strings keep the backslashes literal
# without relying on invalid string escapes such as "\d".
_contains_digit = re.compile(r"\d")
_starts_with_non = re.compile(r"(?i)^non[a-z]")
_starts_with_anti = re.compile(r"(?i)^anti[a-z]")
_split_by_punctuation = re.compile(r"(\W+)")

# Module-level cache store; presumably keyed by taxonomy id (see get_cache)
# -- confirm against its users.
_CACHE = {}


def get_cache(taxonomy_id):
    """Return thread-safe cache for the given taxonomy id.

    :param taxonomy_id: identifier of the taxonomy
    :type taxonomy_id: str
Exemplo n.º 5
0
import sys
import urllib
import urllib2
import datetime
from xml.dom.minidom import parse
from time import sleep

from invenio.config import CFG_ETCDIR, CFG_CROSSREF_USERNAME, \
 CFG_CROSSREF_PASSWORD, CFG_CROSSREF_EMAIL
from invenio.legacy.bibconvert.xslt_engine import convert
from invenio.legacy.bibrecord import record_get_field_value
from invenio.utils.url import make_invenio_opener
from invenio.utils.json import json, json_unicode_to_utf8

# URL opener built once at import time; the 'crossrefutils' argument
# presumably names the calling component -- confirm in invenio.utils.url.
CROSSREF_OPENER = make_invenio_opener('crossrefutils')

# Field names for journals, as a list of strings.
FIELDS_JOURNAL = [
    'issn', 'title', 'author', 'volume', 'issue', 'page', 'year', 'type',
    'doi',
]
# Field names for books, as a list of strings.
FIELDS_BOOK = [
    'isbn', 'ser_title', 'vol_title', 'author', 'volume', 'edition_number',
    'page', 'year', 'component_number', 'type', 'doi',
]

# Exception classes
class CrossrefError(Exception):
    """Crossref error carrying an error code.

    :param code: value identifying the failure; kept on ``self.code``
    """

    def __init__(self, code):
        """Store ``code`` and hand it to the base ``Exception``."""
        # Initialise the base class as well, so that ``args`` reliably
        # carries the code (used by repr() and pickling of the exception).
        Exception.__init__(self, code)
        self.code = code

    def __str__(self):
        """Return the ``repr`` of the error code."""
        return repr(self.code)
Exemplo n.º 6
0
Raises InvenioFileDownloadError exception.
"""

import urllib2
import time
import os
import socket
import urllib
import tempfile
import shutil
import sys

from invenio.utils.url import make_invenio_opener

# URL opener built once at import time; the 'filedownloadutils' argument
# presumably names the calling component -- confirm in invenio.utils.url.
URL_OPENER = make_invenio_opener('filedownloadutils')

from invenio.config import (CFG_TMPSHAREDDIR,
                            CFG_BIBUPLOAD_FFT_ALLOWED_LOCAL_PATHS,
                            CFG_WEBSUBMIT_STORAGEDIR)

#: Block size used when performing I/O: 8192.
CFG_FILEUTILS_BLOCK_SIZE = 8 * 1024


class InvenioFileDownloadError(Exception):
    """Generic exception for download failures.

    :param msg: message handed to the base ``Exception``
    :param code: optional code kept on the ``code`` attribute
    """

    def __init__(self, msg, code=None):
        super(InvenioFileDownloadError, self).__init__(msg)
        self.code = code
Exemplo n.º 7
0
"""

from __future__ import print_function

__revision__ = "$Id$"

import pprint
import sys
import re
import getopt
from invenio.legacy.search_engine import perform_request_search
from invenio.legacy.bibrecord import get_fieldvalues
from invenio.config import CFG_CERN_SITE
from invenio.utils.url import make_invenio_opener

# URL opener built once at import time for this module.
BIBFORMAT_OPENER = make_invenio_opener('BibFormat')

# Tag used for the journal name; CERN sites use a different one.
journal_name_tag = '773__p' if CFG_CERN_SITE else '909C4p'

issns = {   'aapps bull.': '0218-2203',
            'account. manag. inf. technol.': '0959-8022',
            'acm comput. surv.': '0360-0300',
            'acm sigplan not.': '0362-1340',
            'acm trans. comput. syst.': '0734-2071',
            'acm trans. comput.-hum. interact.': '1073-0516',
            'acm trans. database syst.': '0362-5915',
            'acm trans. graph.': '0730-0301',
            'acm trans. inf. syst. secur.': '1094-9224',
Exemplo n.º 8
0
"""

__revision__ = "$Id$"

import getopt
import sys
import time
import re
import ConfigParser

from invenio.utils.url import make_invenio_opener
from invenio.config import CFG_ETCDIR
from invenio.legacy.dbquery import run_sql
from invenio.modules.ranker.registry import configuration

# URL opener built once at import time; the 'BibRank' argument presumably
# names the calling component -- confirm in invenio.utils.url.
BIBRANK_OPENER = make_invenio_opener('BibRank')

# Module-wide options; bibrankgkb reads opts_dict["verbose"].
opts_dict = {}
# NOTE(review): -1 looks like a "no task yet" placeholder -- confirm.
task_id = -1

def bibrankgkb(config):
    """Generates a .kb file based on input from the configuration file"""

    if opts_dict["verbose"] >= 1:
        write_message("Running: Generate Knowledgebase.")
    journals = {}
    journal_src = {}
    i = 0

    #Reading the configuration file
    while config.has_option("bibrankgkb","create_%s" % i):
Exemplo n.º 9
0
from invenio.modules.classifier.errors import TaxonomyError

# Module-level logger, obtained through bconfig.get_logger.
log = bconfig.get_logger("bibclassify.ontology_reader")
from invenio import config

from invenio.modules.classifier.registry import taxonomies

# In standalone mode no database layer is available (dbquery is None) and
# urllib2's plain urlopen is used; otherwise dbquery comes from Invenio and
# urlopen is bound to the open method of an Invenio-built opener.
if bconfig.STANDALONE:
    dbquery = None
    from urllib2 import urlopen
else:
    from invenio.legacy import dbquery
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener("BibClassify").open

# Pre-compiled regular expressions. Raw strings keep the backslashes literal
# without relying on invalid string escapes such as "\d".
_contains_digit = re.compile(r"\d")
_starts_with_non = re.compile(r"(?i)^non[a-z]")
_starts_with_anti = re.compile(r"(?i)^anti[a-z]")
_split_by_punctuation = re.compile(r"(\W+)")

# Module-level cache store; presumably keyed by taxonomy id (see get_cache)
# -- confirm against its users.
_CACHE = {}


def get_cache(taxonomy_id):
    """Return thread-safe cache for the given taxonomy id.

    :param taxonomy_id: identifier of the taxonomy
    :type taxonomy_id: str
Exemplo n.º 10
0
import sys
import urllib
import urllib2
import datetime
from xml.dom.minidom import parse
from time import sleep

from invenio.config import CFG_ETCDIR, CFG_CROSSREF_USERNAME, \
 CFG_CROSSREF_PASSWORD, CFG_CROSSREF_EMAIL
from invenio.legacy.bibconvert.xslt_engine import convert
from invenio.legacy.bibrecord import record_get_field_value
from invenio.utils.url import make_invenio_opener
from invenio.utils.json import json, json_unicode_to_utf8

# URL opener built once at import time; the 'crossrefutils' argument
# presumably names the calling component -- confirm in invenio.utils.url.
CROSSREF_OPENER = make_invenio_opener('crossrefutils')

# Field names for journals, as a list of strings.
FIELDS_JOURNAL = [
    'issn', 'title', 'author', 'volume', 'issue', 'page', 'year', 'type',
    'doi',
]
# Field names for books, as a list of strings.
FIELDS_BOOK = [
    'isbn', 'ser_title', 'vol_title', 'author', 'volume', 'edition_number',
    'page', 'year', 'component_number', 'type', 'doi',
]


# Exception classes
class CrossrefError(Exception):
    """Crossref errors"""
    def __init__(self, code):
        """Initialisation"""
        self.code = code

    def __str__(self):
        """Returns error code"""
Exemplo n.º 11
0
from datetime import datetime, timedelta

from flask import current_app

from invenio_base.globals import cfg
from invenio.utils.url import make_invenio_opener

import rdflib

from six import iteritems
from six.moves import cPickle

from .errors import TaxonomyError
from .registry import taxonomies

# Bind the open method of an Invenio-built opener under the name urlopen.
urlopen = make_invenio_opener('classifier').open

# Pre-compiled regular expressions. Raw strings keep the backslashes literal
# without relying on invalid string escapes such as "\d".
_contains_digit = re.compile(r"\d")
_starts_with_non = re.compile(r"(?i)^non[a-z]")
_starts_with_anti = re.compile(r"(?i)^anti[a-z]")
_split_by_punctuation = re.compile(r"(\W+)")

# Module-level cache store; presumably keyed by taxonomy id (see get_cache)
# -- confirm against its users.
_CACHE = {}


def get_cache(taxonomy_id):
    """Return thread-safe cache for the given taxonomy id.

    :param taxonomy_id: identifier of the taxonomy
    :type taxonomy_id: str
Exemplo n.º 12
0
from datetime import datetime, timedelta

from flask import current_app

from invenio.base.globals import cfg
from invenio.utils.url import make_invenio_opener

import rdflib

from six import iteritems
from six.moves import cPickle

from .errors import TaxonomyError
from .registry import taxonomies

# Bind the open method of an Invenio-built opener under the name urlopen.
urlopen = make_invenio_opener('classifier').open

# Pre-compiled regular expressions. Raw strings keep the backslashes literal
# without relying on invalid string escapes such as "\d".
_contains_digit = re.compile(r"\d")
_starts_with_non = re.compile(r"(?i)^non[a-z]")
_starts_with_anti = re.compile(r"(?i)^anti[a-z]")
_split_by_punctuation = re.compile(r"(\W+)")

# Module-level cache store; presumably keyed by taxonomy id (see get_cache)
# -- confirm against its users.
_CACHE = {}


def get_cache(taxonomy_id):
    """Return thread-safe cache for the given taxonomy id.

    :param taxonomy_id: identifier of the taxonomy
    :type taxonomy_id: str
Exemplo n.º 13
0
"""

__revision__ = "$Id$"

import getopt
import sys
import time
import re
import ConfigParser

from invenio.utils.url import make_invenio_opener
from invenio.config import CFG_ETCDIR
from invenio.legacy.dbquery import run_sql
from invenio.modules.ranker.registry import configuration

# URL opener built once at import time; the 'BibRank' argument presumably
# names the calling component -- confirm in invenio.utils.url.
BIBRANK_OPENER = make_invenio_opener('BibRank')

# Module-wide options; bibrankgkb reads opts_dict["verbose"].
opts_dict = {}
# NOTE(review): -1 looks like a "no task yet" placeholder -- confirm.
task_id = -1


def bibrankgkb(config):
    """Generates a .kb file based on input from the configuration file"""

    if opts_dict["verbose"] >= 1:
        write_message("Running: Generate Knowledgebase.")
    journals = {}
    journal_src = {}
    i = 0

    #Reading the configuration file
Exemplo n.º 14
0
determine if a local file is a PDF file.

This module is STANDALONE safe
"""

import os
import re

from invenio.legacy.bibclassify import config as bconfig

# In standalone mode urllib2's plain urlopen is used; otherwise urlopen is
# bound to the open method of an Invenio-built opener.
if bconfig.STANDALONE:
    from urllib2 import urlopen
else:
    from invenio.utils.url import make_invenio_opener

    urlopen = make_invenio_opener('BibClassify').open

# Module-level logger, obtained through bconfig.get_logger.
log = bconfig.get_logger("bibclassify.text_extractor")

# Matches a run of at least two ASCII letters.
_ONE_WORD = re.compile("[A-Za-z]{2,}")


def is_pdf(document):
    """Check if a document is a PDF file and return True if is is."""
    if not executable_exists('pdftotext'):
        log.warning("GNU file was not found on the system. "
                    "Switching to a weak file extension test.")
        if document.lower().endswith(".pdf"):
            return True
        return False
        # Tested with file version >= 4.10. First test is secure and works
Exemplo n.º 15
0
import urllib2, time, os, sys, re
from invenio.config import CFG_TMPDIR, \
                           CFG_PLOTEXTRACTOR_SOURCE_BASE_URL, \
                           CFG_PLOTEXTRACTOR_SOURCE_TARBALL_FOLDER, \
                           CFG_PLOTEXTRACTOR_SOURCE_PDF_FOLDER, \
                           CFG_PLOTEXTRACTOR_DOWNLOAD_TIMEOUT
from .config import CFG_PLOTEXTRACTOR_DESY_BASE, \
                                         CFG_PLOTEXTRACTOR_DESY_PIECE
from invenio.legacy.search_engine import get_record
from invenio.legacy.bibrecord import record_get_field_instances, \
                              field_get_subfield_values
from invenio.utils.shell import run_shell_command
from .output_utils import write_message
from invenio.utils.url import make_invenio_opener

# URL opener built once at import time; the 'plotextractor' argument
# presumably names the calling component -- confirm in invenio.utils.url.
PLOTEXTRACTOR_OPENER = make_invenio_opener('plotextractor')

PDF_EXTENSION = '.pdf'

ARXIV_HEADER = 'arXiv:'

# One list per HEP area. Position 0 is addressed through URL and position 1
# through BEGIN_YEAR_MONTH_INDEX (both defined below).
# A note about hep-ex: the hep-ex papers from 9403 and 9404 are stored
# in arXiv's servers as hep-ph.
HEP_EX = ['hep-ex/', 9405, '%shep-ex_' % ARXIV_HEADER]  # experimental
HEP_LAT = ['hep-lat/', 9107, '%shep-lat_' % ARXIV_HEADER]  # lattice
HEP_PH = ['hep-ph/', 9203, '%shep-ph_' % ARXIV_HEADER]  # phenomenology
HEP_TH = ['hep-th/', 9108, '%shep-th_' % ARXIV_HEADER]  # theory

HEP_AREAS = [HEP_EX, HEP_LAT, HEP_PH, HEP_TH]

# Positions inside each HEP area list.
URL, BEGIN_YEAR_MONTH_INDEX = 0, 1
Exemplo n.º 16
0
from datetime import datetime, timedelta

from flask import current_app

from invenio.base.globals import cfg
from invenio.utils.url import make_invenio_opener

import rdflib

from six import iteritems
from six.moves import cPickle

from .errors import TaxonomyError
from .registry import taxonomies

# Bind the open method of an Invenio-built opener under the name urlopen.
urlopen = make_invenio_opener("classifier").open

# Pre-compiled regular expressions. Raw strings keep the backslashes literal
# without relying on invalid string escapes such as "\d".
_contains_digit = re.compile(r"\d")
_starts_with_non = re.compile(r"(?i)^non[a-z]")
_starts_with_anti = re.compile(r"(?i)^anti[a-z]")
_split_by_punctuation = re.compile(r"(\W+)")

# Module-level cache store; presumably keyed by taxonomy id (see get_cache)
# -- confirm against its users.
_CACHE = {}


def get_cache(taxonomy_id):
    """Return thread-safe cache for the given taxonomy id.

    :param taxonomy_id: identifier of the taxonomy
    :type taxonomy_id: str
Exemplo n.º 17
0
import invenio.legacy.bibcirculation.db_layer as db
from invenio.legacy.bibcirculation.config import \
                                CFG_BIBCIRCULATION_WORKING_DAYS, \
                                CFG_BIBCIRCULATION_HOLIDAYS, \
                                CFG_CERN_SITE, \
                                CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN, \
                                CFG_BIBCIRCULATION_ITEM_STATUS_ON_SHELF, \
                                CFG_BIBCIRCULATION_ITEM_STATUS_IN_PROCESS, \
                                CFG_BIBCIRCULATION_REQUEST_STATUS_PENDING, \
                                CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING, \
                                CFG_BIBCIRCULATION_LOAN_STATUS_ON_LOAN, \
                                CFG_BIBCIRCULATION_LOAN_STATUS_EXPIRED, \
                                CFG_BIBCIRCULATION_LOAN_STATUS_RETURNED

# Matches the textual form of a flat dict: single-quoted keys whose values
# are single- or double-quoted strings, digit runs or None. Written as raw
# strings so that "\{" and "\}" are not invalid string escapes; the pattern
# value itself is unchanged.
DICC_REGEXP = re.compile(
    r"""^\{('[^']*': ?('[^']*'|"[^"]+"|[0-9]*|None)"""
    r"""(, ?'[^']*': ?('[^']*'|"[^"]+"|[0-9]*|None))*)?\}$""")
# URL opener built once at import time; the 'BibCirculation' argument
# presumably names the calling component -- confirm in invenio.utils.url.
BIBCIRCULATION_OPENER = make_invenio_opener('BibCirculation')

def search_user(column, string):
    if string is not None:
        string = string.strip()

    if CFG_CERN_SITE == 1:
        if column == 'name':
            result = db.search_borrower_by_name(string)
        else:
            if column == 'email':
                try:
                    result = db.search_borrower_by_email(string)
                except:
                    result = ()
            else:
Exemplo n.º 18
0
import invenio.legacy.bibcirculation.db_layer as db
from invenio.legacy.bibcirculation.config import \
                                CFG_BIBCIRCULATION_WORKING_DAYS, \
                                CFG_BIBCIRCULATION_HOLIDAYS, \
                                CFG_CERN_SITE, \
                                CFG_BIBCIRCULATION_ITEM_STATUS_ON_LOAN, \
                                CFG_BIBCIRCULATION_ITEM_STATUS_ON_SHELF, \
                                CFG_BIBCIRCULATION_ITEM_STATUS_IN_PROCESS, \
                                CFG_BIBCIRCULATION_REQUEST_STATUS_PENDING, \
                                CFG_BIBCIRCULATION_REQUEST_STATUS_WAITING, \
                                CFG_BIBCIRCULATION_LOAN_STATUS_ON_LOAN, \
                                CFG_BIBCIRCULATION_LOAN_STATUS_EXPIRED, \
                                CFG_BIBCIRCULATION_LOAN_STATUS_RETURNED

# Matches the textual form of a flat dict: single-quoted keys whose values
# are single- or double-quoted strings, digit runs or None. Written as raw
# strings so that "\{" and "\}" are not invalid string escapes; the pattern
# value itself is unchanged.
DICC_REGEXP = re.compile(
    r"""^\{('[^']*': ?('[^']*'|"[^"]+"|[0-9]*|None)"""
    r"""(, ?'[^']*': ?('[^']*'|"[^"]+"|[0-9]*|None))*)?\}$""")
# URL opener built once at import time; the 'BibCirculation' argument
# presumably names the calling component -- confirm in invenio.utils.url.
BIBCIRCULATION_OPENER = make_invenio_opener('BibCirculation')

def search_user(column, string):
    if string is not None:
        string = string.strip()

    if CFG_CERN_SITE == 1:
        if column == 'name':
            result = db.search_borrower_by_name(string)
        else:
            if column == 'email':
                try:
                    result = db.search_borrower_by_email(string)
                except:
                    result = ()
            else:
import urllib
import mimetools
import intbitset
from invenio.utils.url import make_invenio_opener
from invenio.utils.json import json
from invenio.config import CFG_SOLR_URL, \
                           CFG_WEBSEARCH_FULLTEXT_SNIPPETS, \
                           CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS


# The solr module is imported and a connection created only when a Solr URL
# is configured; SOLR_CONNECTION is left undefined otherwise.
if CFG_SOLR_URL:
    import solr
    SOLR_CONNECTION = solr.SolrConnection(CFG_SOLR_URL) # pylint: disable=E1101


# URL opener built once at import time; the 'solrutils' argument presumably
# names the calling component -- confirm in invenio.utils.url.
SOLRUTILS_OPENER = make_invenio_opener('solrutils')

def solr_get_facets(bitset, solr_url):
    facet_query_url = "%s/invenio_facets" % solr_url
    # now use the bitset to fetch the facet data
    r = urllib2.Request(facet_query_url)
    data = bitset.fastdump()
    boundary = mimetools.choose_boundary()

    # fool solr into thinking we're uploading a file so it will read our data as a stream
    contents = '--%s\r\n' % boundary
    contents += 'Content-Disposition: form-data; name="bitset"; filename="bitset"\r\n'
    contents += 'Content-Type: application/octet-stream\r\n'
    contents += '\r\n' + data + '\r\n'
    contents += '--%s--\r\n\r\n' % boundary
    r.add_data(contents)