示例#1
0
 def setUp(self):
     """Setup initial document."""
     self.els = ElsevierPackage(CONSYN=True,
                                journal_mappings=journal_mappings)
     self.document = parse(
         pkg_resources.resource_filename(
             'harvestingkit.tests',
             os.path.join('data', 'sample_consyn_record.xml')))
示例#2
0
def call_elsevier(settings):
    if settings.update_credentials:
        _query_user_for_credentials(['ELSEVIER'])
        return

    elsevier_package = ElsevierPackage(
        package_name=settings.package_name,
        path=settings.path,
        run_locally=settings.run_locally,
        extract_nations=settings.extract_nations)
    elsevier_package.bibupload_it()
 def setUp(self):
     """Setup initial document."""
     self.els = ElsevierPackage(no_harvest=True)
     self.document = parse(
         pkg_resources.resource_filename(
             'harvestingkit.tests',
             os.path.join('data', 'sample_elsevier_document_output.xml')))
     self.document540 = parse(
         pkg_resources.resource_filename(
             'harvestingkit.tests',
             os.path.join('data',
                          'sample_elsevier_540_document_output.xml')))
示例#4
0
def bst_elsevier():
    els = ElsevierPackage()
    run(els, els.conn.packages_delivery, els.doi_package_name_mapping)
示例#5
0
def bst_consyn_harvest(feed=None,
                       package=None,
                       package_list=None,
                       batch_size='500',
                       delete_zip='False',
                       upload_FTP='True'):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are three execution modes:
    1. Download from an atom feed.
    2. Extract a zip package.
    3. Extract a list of zip packages.

    :param feed: The URL of the atom feed to download.
    :type feed: string

    :param package: A path to a zip package
    :type package: string

    :param package_list: A path to a file with a list of paths to zip packages
    :type package_list: string

    :param batch_size: The number of records contained in each output file
    :type batch_size: string representation of an integer

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not
    :type delete_zip: string representation of a boolean

    :param upload_FTP: Flag to indicate whether the result files
                       should be uploaded to the FTP server
    :type upload_FTP: string representation of a boolean
    """
    if not feed:
        feed = "https://consyn.elsevier.com/batch/atom?key=%s" % \
               (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n' +
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message(
            'Warning delete_zip parameter is not a valid Boolean (True/False)\n'
            + 'the default value \'False\' has been used!\n')
    if upload_FTP.lower() == 'true':
        upload_FTP = True
    elif upload_FTP.lower() == 'false':
        upload_FTP = False
    else:
        upload_FTP = True
        write_message(
            'Warning upload_FTP parameter is not a valid Boolean (True/False)\n'
            + 'the default value \'True\' has been used!\n')

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        rmdir(create_work_folder(CFG_CONSYN_OUT_DIRECTORY))
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    els = ElsevierPackage(CONSYN=True)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if not package and not package_list:
        download_feed(feed, batch_size, delete_zip, new_sources, out_folder)
        task_update_progress("Converting files 2/3...")
        task_sleep_now_if_required(can_stop_too=True)
        fetch_xml_files(consyn_files, els, new_files)
        task_sleep_now_if_required(can_stop_too=False)
    else:
        xml_files = []
        if package:
            xml_files = extract_package(package, batch_size, delete_zip,
                                        out_folder)
        elif package_list:
            xml_files = extract_multiple_packages(package_list, batch_size,
                                                  delete_zip, new_sources,
                                                  out_folder)
            task_update_progress("Converting files 2/3...")
            results = convert_files(xml_files, els, prefix=consyn_files)

            for dummy, (status_code, result) in results.iteritems():
                if status_code == StatusCodes.OK:
                    new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    create_collection(batch_size, new_files, new_sources, out_folder,
                      upload_FTP)
示例#6
0
def bst_consyn_harvest(feed_url=None,
                       package=None,
                       feed_file=None,
                       package_list_file=None,
                       batch_size='500',
                       delete_zip='False',
                       submit='False',
                       threshold_date=None):
    """ Task to convert xml files from consyn.elsevier.com to Marc xml files.
    There are four execution modes:
    1. Download from an atom feed url.
    2. Extract and convert a zip package.
    3. Download from an atom feed file.
    4. Extract and convert a list of zip packages.

    The feed is stored to the file system under the folder feeds.
    If no errors occur during the execution of the tasklet the feed
    is deleted. Records may be recovered running the tasklet again with
    the modes 2, 3 or 4.

    :param feed_url: A URL to the atom feed.
    :type feed: string.

    :param package: A path to a zip package.
    :type package: string.

    :param package: A path to an atom feed file.
    :type package: string.

    :param package_list_file: A path to a file with a list of paths
                              to zip packages. The file must contain
                              the path to each package in a different
                              line.
    :type package_list_file: string.

    :param batch_size: The number of records contained in each output file.
    :type batch_size: string representation of an integer.

    :param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not.
    :type delete_zip: string representation of a boolean.

    :param submit: Flag to indicate whether the result files
                       should be submited by email and uploaded
                       to FTP server.
    :type submit: string representation of a boolean.
    :param threshold_date: threshold date only converts records that they were
                      published after threshold_date
    :type threshold_date: string in the format YYYY-MM-DD
    """
    if not feed_url:
        feed_url = "https://consyn.elsevier.com/batch/atom?key=%s" % \
                   (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []
    feed_location = ''

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n'
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if submit.lower() == 'true':
        submit = True
    elif submit.lower() == 'false':
        submit = False
    else:
        submit = False
        write_message('Warning upload_FTP parameter is not'
                      ' a valid Boolean (True/False)\n'
                      'the default value \'False\' has been used!\n')
    if threshold_date:
        import time
        date_format = "%Y-%m-%d"
        try:
            date = datetime(*(time.strptime(threshold_date, date_format)[0:6]))
            threshold_date = date.strftime('%Y-%m-%d')
        except ValueError:
            write_message('Error threshold_date parameter is not '
                          'in the right format. It should be in '
                          'form "YYYY-MM-DD".')
            task_update_status("ERROR")
            return

    if not exists(CFG_CONSYN_OUT_DIRECTORY):
        makedirs(CFG_CONSYN_OUT_DIRECTORY)
    out_folder = CFG_CONSYN_OUT_DIRECTORY
    journal_mappings = get_kbs()['journals'][1]
    els = ElsevierPackage(CONSYN=True, journal_mappings=journal_mappings)

    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()

    if package:
        xml_files = extract_package(package, delete_zip, out_folder,
                                    new_sources)
    elif package_list_file:
        package_list = []
        with open(package_list_file, 'r') as package_file:
            for line in package_file:
                line = line.strip()
                if line:
                    package_list.append(line)
        xml_files = extract_multiple_packages(package_list, delete_zip,
                                              new_sources, out_folder)
    elif feed_file:
        entries = parse_feed(feed_file)
        links = [a[0] for a in entries]
        package_list = [a[1] for a in entries]
        package_list = [
            join(CFG_CONSYN_OUT_DIRECTORY, a) for a in package_list
        ]
        for package in package_list:
            task_sleep_now_if_required()
            if not exists(package):
                index = package_list.index(package)
                link = links[index]
                link = link.replace(' ', '%20')
                try:
                    message = ("Downloading %s to %s\n" % (link, package))
                    write_message(message)
                    download_url(link, "zip", package, 5, 60.0)
                    package_list.append(package)
                except InvenioFileDownloadError as err:
                    message = "URL could not be opened: " + link
                    write_message(message)
                    write_message(str(err))
                    write_message(traceback.format_exc()[:-1])
                    task_update_status("CERROR")
                    continue
            xml_files = extract_multiple_packages(package_list, delete_zip,
                                                  new_sources, out_folder)
    else:
        feeds_folder = join(CFG_CONSYN_OUT_DIRECTORY, 'feeds')
        if not exists(feeds_folder):
            makedirs(feeds_folder)
        date = datetime.now().strftime("%Y.%m.%d")
        feed_location = "feed-%s.xml" % date
        feed_location = join(feeds_folder, feed_location)
        xml_files = download_feed(feed_url, delete_zip, new_sources,
                                  out_folder, feed_location)
    task_update_progress("Converting files 2/3...")
    task_sleep_now_if_required()
    results = convert_files(xml_files,
                            els,
                            prefix=consyn_files,
                            threshold_date=threshold_date)
    for dummy, (status_code, result) in results.iteritems():
        if status_code == StatusCodes.OK:
            new_files.append(result)
    task_update_progress("Compiling output 3/3...")
    task_sleep_now_if_required()
    create_collection(batch_size, new_files, new_sources, out_folder, submit)
    if feed_location and not _errors_detected:
        remove(feed_location)
    for error in _errors_detected:
        write_message(str(error))
def bst_consyn_harvest(feed=None, package=None, package_list=None,
                       batch_size='500', delete_zip='False'):
    """
    Task to convert xml files from consyn.elsevier.com to marc xml files.
    There are three excecution modes:
    1. Download from an atom feed.
    2. Extract a zip package.
    3. Extract a list of zip packages.

    @param feed: The URL of the atom feed to download.
    @type feed: string

    @param package: A path to a zip package
    @type package: string

    @param package_list: A path to a file with a list of paths to zip packages
    @type package_list: string

    @param batch_size: The number of records contained in each output file
    @type batch_size: int

    @param delete_zip: Flag to indicate if the downloaded zip files
                       should be kept on the disk or not
    @type delete_zip: boolean
    """
    if not feed:
        feed = "https://consyn.elsevier.com/batch/atom?key=%s" % \
               (CFG_CONSYN_ATOM_KEY,)
    new_files = []
    new_sources = []

    try:
        batch_size = int(batch_size)
    except ValueError:
        batch_size = 500
        write_message('Warning batch_size parameter is not a valid integer\n' +
                      'the default value \'500\' has been used!\n')
    if delete_zip.lower() == 'true':
        delete_zip = True
    elif delete_zip.lower() == 'false':
        delete_zip = False
    else:
        delete_zip = False
        write_message('Warning delete_zip parameter is not a valid Boolean (True/False)\n' +
                      'the default value \'False\' has been used!\n')

    out_folder = create_work_folder(CFG_CONSYN_OUT_DIRECTORY)

    try:
        run_sql("SELECT filename FROM CONSYNHARVEST")
    except ProgrammingError:
        # Table missing, create it.
        run_sql("CREATE TABLE CONSYNHARVEST ("
                "filename VARCHAR(100) NOT NULL PRIMARY KEY,"
                "date VARCHAR(50),"
                "size VARCHAR(30) );")

    if not package and not package_list:
        download_feed(feed, batch_size, delete_zip, new_sources, out_folder)
    elif package:
        extract_package(package, batch_size, delete_zip, out_folder)
    elif package_list:
        extract_multiple_packages(package_list, batch_size,
                                  delete_zip, new_sources,
                                  out_folder)

    task_sleep_now_if_required(can_stop_too=True)
    consyn_files = join(out_folder, "consyn-files")
    consyn_files = consyn_files.lstrip()
    els = ElsevierPackage(path="whatever", CONSYN=True)
    task_update_progress("Converting files 2/2...")
    fetch_xml_files(consyn_files, els, new_files)
    task_sleep_now_if_required(can_stop_too=False)
    create_collection(batch_size, new_files, new_sources, out_folder)