def oai_harvest_get(prefix, baseurl, harvestpath,
                    fro=None, until=None, setspecs=None,
                    user=None, password=None, cert_file=None,
                    key_file=None, method="POST", verb="ListRecords",
                    identifier=""):
    """
    Retrieve OAI records from given repository, with given arguments.

    Builds the OAI request arguments, splits ``baseurl`` into its network
    location and path, and delegates the actual harvesting to
    ``getter.harvest``.

    :param prefix: value sent as the ``metadataPrefix`` request argument.
    :param baseurl: URL of the repository to harvest; a scheme of
        ``https`` makes the request be flagged as secure.
    :param harvestpath: forwarded unchanged to ``getter.harvest``
        (presumably the output location -- confirm against ``getter``).
    :param fro: value of the ``from`` request argument; left out when
        falsy.
    :param until: value of the ``until`` request argument; left out when
        falsy.
    :param setspecs: whitespace-separated string of OAI set names, or a
        falsy value to harvest without sets.
    :param user: forwarded unchanged to ``getter.harvest``.
    :param password: forwarded unchanged to ``getter.harvest``.
    :param cert_file: forwarded unchanged to ``getter.harvest``.
    :param key_file: forwarded unchanged to ``getter.harvest``.
    :param method: HTTP method name forwarded to ``getter.harvest``.
    :param verb: value sent as the ``verb`` request argument.
    :param identifier: value of the ``identifier`` request argument; left
        out when falsy.

    :return: the value returned by ``getter.harvest``.
    :raises Exception: when a ``StandardError`` or
        ``getter.InvenioOAIRequestError`` occurs; the error is registered
        through ``register_exception`` before being re-raised with the
        repository URL in the message.
    """
    try:
        # Only scheme, network location and path are needed; params,
        # query and fragment of the URL are ignored.
        addressing_scheme, network_location, path = \
            urlparse.urlparse(baseurl)[:3]
        secure = (addressing_scheme == "https")

        http_param_dict = {'verb': verb,
                           'metadataPrefix': prefix}
        if identifier:
            http_param_dict['identifier'] = identifier
        if fro:
            http_param_dict['from'] = fro
        if until:
            http_param_dict['until'] = until

        sets = None
        if setspecs:
            # Split on any run of whitespace so that repeated, leading or
            # trailing blanks never produce empty set names. A string made
            # only of whitespace is treated like "no sets".
            sets = setspecs.split() or None

        harvested_files = getter.harvest(network_location, path,
                                         http_param_dict, method,
                                         harvestpath, sets, secure,
                                         user, password,
                                         cert_file, key_file)
        return harvested_files
    except (StandardError, getter.InvenioOAIRequestError) as exce:
        register_exception()
        raise Exception("An error occurred while harvesting from %s: %s\n"
                        % (baseurl, str(exce)))
def oai_harvest_get(prefix, baseurl, harvestpath,
                    fro=None, until=None, setspecs=None,
                    user=None, password=None, cert_file=None,
                    key_file=None, method="POST", verb="ListRecords",
                    identifier=""):
    """
    Retrieve OAI records from given repository, with given arguments.

    Builds the OAI request arguments, splits ``baseurl`` into its network
    location and path, and delegates the actual harvesting to
    ``getter.harvest``.

    :param prefix: value sent as the ``metadataPrefix`` request argument.
    :param baseurl: URL of the repository to harvest; a scheme of
        ``https`` makes the request be flagged as secure.
    :param harvestpath: forwarded unchanged to ``getter.harvest``
        (presumably the output location -- confirm against ``getter``).
    :param fro: value of the ``from`` request argument; left out when
        falsy.
    :param until: value of the ``until`` request argument; left out when
        falsy.
    :param setspecs: whitespace-separated string of OAI set names, or a
        falsy value to harvest without sets.
    :param user: forwarded unchanged to ``getter.harvest``.
    :param password: forwarded unchanged to ``getter.harvest``.
    :param cert_file: forwarded unchanged to ``getter.harvest``.
    :param key_file: forwarded unchanged to ``getter.harvest``.
    :param method: HTTP method name forwarded to ``getter.harvest``.
    :param verb: value sent as the ``verb`` request argument.
    :param identifier: value of the ``identifier`` request argument; left
        out when falsy.

    :return: the value returned by ``getter.harvest``.
    :raises Exception: when a ``StandardError`` or
        ``getter.InvenioOAIRequestError`` occurs; the error is registered
        through ``register_exception`` before being re-raised with the
        repository URL in the message.
    """
    try:
        # Only scheme, network location and path are needed; params,
        # query and fragment of the URL are ignored.
        addressing_scheme, network_location, path = \
            urlparse.urlparse(baseurl)[:3]
        secure = (addressing_scheme == "https")

        http_param_dict = {'verb': verb,
                           'metadataPrefix': prefix}
        if identifier:
            http_param_dict['identifier'] = identifier
        if fro:
            http_param_dict['from'] = fro
        if until:
            http_param_dict['until'] = until

        sets = None
        if setspecs:
            # Split on any run of whitespace so that repeated, leading or
            # trailing blanks never produce empty set names. A string made
            # only of whitespace is treated like "no sets".
            sets = setspecs.split() or None

        harvested_files = getter.harvest(network_location, path,
                                         http_param_dict, method,
                                         harvestpath, sets, secure,
                                         user, password,
                                         cert_file, key_file)
        return harvested_files
    except (StandardError, getter.InvenioOAIRequestError) as exce:
        register_exception()
        raise Exception("An error occurred while harvesting from %s: %s\n"
                        % (baseurl, str(exce)))
def main():
    """Start the tool.

    If the command line arguments are those of the 'manual' mode, then
    starts a manual one-time harvesting. Else trigger a BibSched task
    for automated harvesting based on the OAIHarvest admin settings.

    Manual mode is selected when the arguments parse with the manual
    option set and a ``-v``/``--verb`` option is present; the harvest is
    then run directly through ``getter.harvest`` and the function returns.
    In every other case the ``-r``/``--repository`` and ``--workflow``
    arguments are validated and the task is handed to ``task_init``.
    Invalid input is reported through ``usage``.
    """
    # Let's try to parse the arguments as used in manual harvesting:
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "o:v:m:p:i:s:f:u:r:c:k:l:w:",
            [
                "output=",
                "verb=",
                "method=",
                "metadataPrefix=",
                "identifier=",
                "set=",
                "from=",
                "until=",
                "resumptionToken=",
                "certificate=",
                "key=",
                "user=",
                "password=",
                "workflow=",
            ])

        # So everything went smoothly: start harvesting in manual mode
        if any(opt in ('-v', '--verb') for opt, dummy_value in opts):
            # verb parameter is given
            http_param_dict = {}
            method = "POST"
            output = ""
            user = None
            password = None
            cert_file = None
            key_file = None
            sets = []

            # get options and arguments
            for opt, opt_value in opts:
                if opt in ["-v", "--verb"]:
                    http_param_dict['verb'] = opt_value
                elif opt in ["-m", '--method']:
                    # Any value other than GET/POST is silently ignored
                    # and the default method is kept.
                    if opt_value == "GET" or opt_value == "POST":
                        method = opt_value
                elif opt in ["-p", "--metadataPrefix"]:
                    http_param_dict['metadataPrefix'] = opt_value
                elif opt in ["-i", "--identifier"]:
                    http_param_dict['identifier'] = opt_value
                elif opt in ["-s", "--set"]:
                    sets = opt_value.split()
                elif opt in ["-f", "--from"]:
                    http_param_dict['from'] = opt_value
                elif opt in ["-u", "--until"]:
                    http_param_dict['until'] = opt_value
                elif opt in ["-r", "--resumptionToken"]:
                    http_param_dict['resumptionToken'] = opt_value
                elif opt in ["-o", "--output"]:
                    output = opt_value
                elif opt in ["-c", "--certificate"]:
                    cert_file = opt_value
                elif opt in ["-k", "--key"]:
                    key_file = opt_value
                elif opt in ["-l", "--user"]:
                    user = opt_value
                elif opt in ["-w", "--password"]:
                    password = opt_value
                elif opt in ["-V", "--version"]:
                    print(__revision__)
                    sys.exit(0)
                else:
                    usage(1, "Option %s is not allowed" % opt)

            if len(args) > 0:
                # The URL to harvest is the last positional argument.
                base_url = args[-1]
                if not base_url.lower().startswith('http'):
                    base_url = 'http://' + base_url
                (addressing_scheme, network_location, path, dummy1,
                 dummy2, dummy3) = urllib.parse.urlparse(base_url)
                secure = (addressing_scheme == "https")

                if (cert_file and not key_file) or \
                        (key_file and not cert_file):
                    # Both are needed if one specified
                    usage(1, "You must specify both certificate and key files")

                if password and not user:
                    # User must be specified when password is given
                    usage(1, "You must specify a username")
                elif user and not password:
                    # No password on the command line: ask for it
                    # interactively.
                    if not secure:
                        sys.stderr.write(
                            "*WARNING* Your password will be sent in clear!\n")
                    try:
                        password = getpass.getpass()
                    except KeyboardInterrupt as error:
                        sys.stderr.write("\n%s\n" % (error,))
                        sys.exit(0)

                getter.harvest(network_location, path,
                               http_param_dict, method,
                               output, sets, secure, user,
                               password, cert_file,
                               key_file)

                sys.stderr.write(
                    "Harvesting completed at: %s\n\n" %
                    time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
                return
            else:
                usage(1, "You must specify the URL to harvest")
        else:
            # verb is not given. We will continue with periodic
            # harvesting. But first check if URL parameter is given:
            # if it is, then warn directly now
            has_identifier = any(opt in ('-i', '--identifier')
                                 for opt, dummy_value in opts)
            if (not has_identifier and len(args) > 1) or \
                    (len(args) == 1 and not args[0].isdigit()):
                usage(1, "You must specify the --verb parameter")
    except getopt.error:
        # So could it be that we are using different arguments? Try to
        # start the BibSched task (automated harvesting) and see if it
        # validates
        pass

    # BibSched mode - periodical harvesting
    # Note that the 'help' is common to both manual and automated
    # mode.

    repository_flag_count = 0
    workflow_flag_count = 0
    repositories = []
    workflows = []

    for opt in sys.argv[1:]:
        # Compare whole arguments: a substring test against the flag name
        # would also count values such as "repo" or "flow" as flags.
        if opt in ("-r", "--repository"):
            repository_flag_count += 1
        elif opt == "--workflow":
            workflow_flag_count += 1

    if repository_flag_count > 1 or workflow_flag_count > 1:
        usage(1, "You can't specify twice -r or --workflow")

    if repository_flag_count == 1:
        if "-r" in sys.argv:
            position = sys.argv.index("-r")
        else:
            position = sys.argv.index("--repository")
        if position + 1 >= len(sys.argv):
            # The flag is the last argument: there is no value to read.
            usage(1, "You must specify a value for -r or --repository")
        else:
            repositories = sys.argv[position + 1].split(",")
        if len(repositories) > 1 and \
                ("-i" in sys.argv or "--identifier" in sys.argv):
            usage(1,
                  "It is impossible to harvest an identifier from several "
                  "repositories.")

    if workflow_flag_count == 1:
        position = sys.argv.index("--workflow")
        if position + 1 >= len(sys.argv):
            # The flag is the last argument: there is no value to read.
            usage(1, "You must specify a value for --workflow")
        else:
            workflows = sys.argv[position + 1].split(",")
        for workflow_candidate in workflows:
            if workflow_candidate not in registry_workflows:
                usage(1, "The workflow %s doesn't exist." % workflow_candidate)

    if repository_flag_count == 1 and workflow_flag_count == 0:
        # Only repositories given: each one must exist and carry a
        # workflow known to the registry.
        for name_repository in repositories:
            try:
                oaiharvest_instance = OaiHARVEST.get(
                    OaiHARVEST.name == name_repository).one()
                if oaiharvest_instance.workflows not in registry_workflows:
                    usage(1,
                          "The repository %s doesn't have a valid workflow specified." % name_repository)
            except orm.exc.NoResultFound:
                usage(1,
                      "The repository %s doesn't exist in our database." % name_repository)

    elif repository_flag_count == 1 and workflow_flag_count == 1:
        # Repositories and an explicit workflow: the repositories only
        # need to exist.
        for name_repository in repositories:
            try:
                OaiHARVEST.get(OaiHARVEST.name == name_repository).one()
            except orm.exc.NoResultFound:
                usage(1,
                      "The repository %s doesn't exist in our database." % name_repository)

        print("A workflow has been specified, overriding the repository one.")

    task_set_option("repository", None)
    task_set_option("dates", None)
    task_set_option("workflow", None)
    task_set_option("identifiers", None)
    task_init(authorization_action='runoaiharvest',
              authorization_msg="oaiharvest Task Submission",
              description="""
Harvest records from OAI sources.
Manual vs automatic harvesting:
   - Manual harvesting retrieves records from the specified URL,
     with the specified OAI arguments. Harvested records are displayed
     on the standard output or saved to a file, but are not integrated
     into the repository. This mode is useful to 'play' with OAI
     repositories or to build special harvesting scripts.
   - Automatic harvesting relies on the settings defined in the OAI
     Harvest admin interface to periodically retrieve the repositories
     and sets to harvest. It also take care of harvesting only new or
     modified records. Records harvested using this mode are converted
     and integrated into the repository, according to the settings
     defined in the OAI Harvest admin interface.

Examples:
Manual (single-shot) harvesting mode:
   Save to /tmp/z.xml records from CDS added/modified between 2004-04-01
   and 2004-04-02, in MARCXML:
     $ oaiharvest -vListRecords -f2004-04-01 -u2004-04-02 -pmarcxml -o/tmp/z.xml http://cds.cern.ch/oai2d
Automatic (periodical) harvesting mode:
   Schedule daily harvesting of all repositories defined in OAIHarvest admin:
     $ oaiharvest -s 24h
   Schedule daily harvesting of repository 'arxiv', defined in OAIHarvest admin:
     $ oaiharvest -r arxiv -s 24h
   Harvest in 10 minutes from 'pubmed' repository records added/modified
   between 2005-05-05 and 2005-05-10:
     $ oaiharvest -r pubmed -d 2005-05-05:2005-05-10 -t 10m
""",
              help_specific_usage='Manual single-shot harvesting mode:\n'
                                  ' -o, --output specify output file\n'
                                  ' -v, --verb OAI verb to be executed\n'
                                  ' -m, --method http method (default POST)\n'
                                  ' -p, --metadataPrefix metadata format\n'
                                  ' -i, --identifier OAI identifier\n'
                                  ' -s, --set OAI set(s). Whitespace-separated list\n'
                                  ' -r, --resumptionToken Resume previous harvest\n'
                                  ' -f, --from from date (datestamp)\n'
                                  ' -u, --until until date (datestamp)\n'
                                  ' -c, --certificate path to public certificate (in case of certificate-based harvesting)\n'
                                  ' -k, --key path to private key (in case of certificate-based harvesting)\n'
                                  ' -l, --user username (in case of password-protected harvesting)\n'
                                  ' -w, --password password (in case of password-protected harvesting)\n'
                                  'Deamon mode (periodical or one-shot harvesting mode):\n'
                                  ' -r, --repository="repo A"[,"repo B"] \t which repositories to harvest (default=all)\n'
                                  ' -d, --dates=yyyy-mm-dd:yyyy-mm-dd \t reharvest given dates only\n'
                                  ' -i, --identifier OAI identifier if wished to run in as a task.\n'
                                  ' --notify-email-to Receive notifications on given email on successful upload and/or finished harvest.\n'
                                  ' --workflow specify the workflow to execute.\n'
                                  ' --create-ticket-in Provide desired ticketing queue to create a ticket in it on upload and/or finished harvest.\n'
                                  ' Requires a configured ticketing system (BibCatalog).\n',
              specific_params=(
                  "r:i:d:W",
                  ["repository=", "identifier=", "dates=", "workflow=",
                   "notify-email-to=", "create-ticket-in="]),
              task_submit_elaborate_specific_parameter_fnc=task_submit_elaborate_specific_parameter,
              task_run_fnc=task_run_core)
def main():
    """Start the tool.

    If the command line arguments are those of the 'manual' mode, then
    starts a manual one-time harvesting. Else trigger a BibSched task
    for automated harvesting based on the OAIHarvest admin settings.

    Manual mode is selected when the arguments parse with the manual
    option set and a ``-v``/``--verb`` option is present; the harvest is
    then run directly through ``getter.harvest`` and the function returns.
    In every other case the ``-r``/``--repository`` and ``--workflow``
    arguments are validated and the task is handed to ``task_init``.
    Invalid input is reported through ``usage``.
    """
    # Let's try to parse the arguments as used in manual harvesting:
    try:
        opts, args = getopt.getopt(
            sys.argv[1:],
            "o:v:m:p:i:s:f:u:r:c:k:l:w:",
            [
                "output=",
                "verb=",
                "method=",
                "metadataPrefix=",
                "identifier=",
                "set=",
                "from=",
                "until=",
                "resumptionToken=",
                "certificate=",
                "key=",
                "user=",
                "password=",
                "workflow=",
            ])

        # So everything went smoothly: start harvesting in manual mode
        if any(opt in ('-v', '--verb') for opt, dummy_value in opts):
            # verb parameter is given
            http_param_dict = {}
            method = "POST"
            output = ""
            user = None
            password = None
            cert_file = None
            key_file = None
            sets = []

            # get options and arguments
            for opt, opt_value in opts:
                if opt in ["-v", "--verb"]:
                    http_param_dict['verb'] = opt_value
                elif opt in ["-m", '--method']:
                    # Any value other than GET/POST is silently ignored
                    # and the default method is kept.
                    if opt_value == "GET" or opt_value == "POST":
                        method = opt_value
                elif opt in ["-p", "--metadataPrefix"]:
                    http_param_dict['metadataPrefix'] = opt_value
                elif opt in ["-i", "--identifier"]:
                    http_param_dict['identifier'] = opt_value
                elif opt in ["-s", "--set"]:
                    sets = opt_value.split()
                elif opt in ["-f", "--from"]:
                    http_param_dict['from'] = opt_value
                elif opt in ["-u", "--until"]:
                    http_param_dict['until'] = opt_value
                elif opt in ["-r", "--resumptionToken"]:
                    http_param_dict['resumptionToken'] = opt_value
                elif opt in ["-o", "--output"]:
                    output = opt_value
                elif opt in ["-c", "--certificate"]:
                    cert_file = opt_value
                elif opt in ["-k", "--key"]:
                    key_file = opt_value
                elif opt in ["-l", "--user"]:
                    user = opt_value
                elif opt in ["-w", "--password"]:
                    password = opt_value
                elif opt in ["-V", "--version"]:
                    print(__revision__)
                    sys.exit(0)
                else:
                    usage(1, "Option %s is not allowed" % opt)

            if len(args) > 0:
                # The URL to harvest is the last positional argument.
                base_url = args[-1]
                if not base_url.lower().startswith('http'):
                    base_url = 'http://' + base_url
                (addressing_scheme, network_location, path, dummy1,
                 dummy2, dummy3) = urllib.parse.urlparse(base_url)
                secure = (addressing_scheme == "https")

                if (cert_file and not key_file) or \
                        (key_file and not cert_file):
                    # Both are needed if one specified
                    usage(1, "You must specify both certificate and key files")

                if password and not user:
                    # User must be specified when password is given
                    usage(1, "You must specify a username")
                elif user and not password:
                    # No password on the command line: ask for it
                    # interactively.
                    if not secure:
                        sys.stderr.write(
                            "*WARNING* Your password will be sent in clear!\n")
                    try:
                        password = getpass.getpass()
                    except KeyboardInterrupt as error:
                        sys.stderr.write("\n%s\n" % (error, ))
                        sys.exit(0)

                getter.harvest(network_location, path,
                               http_param_dict, method,
                               output, sets, secure, user,
                               password, cert_file,
                               key_file)

                sys.stderr.write(
                    "Harvesting completed at: %s\n\n" %
                    time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
                return
            else:
                usage(1, "You must specify the URL to harvest")
        else:
            # verb is not given. We will continue with periodic
            # harvesting. But first check if URL parameter is given:
            # if it is, then warn directly now
            has_identifier = any(opt in ('-i', '--identifier')
                                 for opt, dummy_value in opts)
            if (not has_identifier and len(args) > 1) or \
                    (len(args) == 1 and not args[0].isdigit()):
                usage(1, "You must specify the --verb parameter")
    except getopt.error:
        # So could it be that we are using different arguments? Try to
        # start the BibSched task (automated harvesting) and see if it
        # validates
        pass

    # BibSched mode - periodical harvesting
    # Note that the 'help' is common to both manual and automated
    # mode.

    repository_flag_count = 0
    workflow_flag_count = 0
    repositories = []
    workflows = []

    for opt in sys.argv[1:]:
        # Compare whole arguments: a substring test against the flag name
        # would also count values such as "repo" or "flow" as flags.
        if opt in ("-r", "--repository"):
            repository_flag_count += 1
        elif opt == "--workflow":
            workflow_flag_count += 1

    if repository_flag_count > 1 or workflow_flag_count > 1:
        usage(1, "You can't specify twice -r or --workflow")

    if repository_flag_count == 1:
        if "-r" in sys.argv:
            position = sys.argv.index("-r")
        else:
            position = sys.argv.index("--repository")
        if position + 1 >= len(sys.argv):
            # The flag is the last argument: there is no value to read.
            usage(1, "You must specify a value for -r or --repository")
        else:
            repositories = sys.argv[position + 1].split(",")
        if len(repositories) > 1 and \
                ("-i" in sys.argv or "--identifier" in sys.argv):
            usage(1,
                  "It is impossible to harvest an identifier from several "
                  "repositories.")

    if workflow_flag_count == 1:
        position = sys.argv.index("--workflow")
        if position + 1 >= len(sys.argv):
            # The flag is the last argument: there is no value to read.
            usage(1, "You must specify a value for --workflow")
        else:
            workflows = sys.argv[position + 1].split(",")
        for workflow_candidate in workflows:
            if workflow_candidate not in registry_workflows:
                usage(1, "The workflow %s doesn't exist." % workflow_candidate)

    if repository_flag_count == 1 and workflow_flag_count == 0:
        # Only repositories given: each one must exist and carry a
        # workflow known to the registry.
        for name_repository in repositories:
            try:
                oaiharvest_instance = OaiHARVEST.get(
                    OaiHARVEST.name == name_repository).one()
                if oaiharvest_instance.workflows not in registry_workflows:
                    usage(1,
                          "The repository %s doesn't have a valid workflow specified." % name_repository)
            except orm.exc.NoResultFound:
                usage(1,
                      "The repository %s doesn't exist in our database." % name_repository)

    elif repository_flag_count == 1 and workflow_flag_count == 1:
        # Repositories and an explicit workflow: the repositories only
        # need to exist.
        for name_repository in repositories:
            try:
                OaiHARVEST.get(OaiHARVEST.name == name_repository).one()
            except orm.exc.NoResultFound:
                usage(1,
                      "The repository %s doesn't exist in our database." % name_repository)

        print("A workflow has been specified, overriding the repository one.")

    task_set_option("repository", None)
    task_set_option("dates", None)
    task_set_option("workflow", None)
    task_set_option("identifiers", None)
    task_init(
        authorization_action='runoaiharvest',
        authorization_msg="oaiharvest Task Submission",
        description="""
Harvest records from OAI sources.
Manual vs automatic harvesting:
   - Manual harvesting retrieves records from the specified URL,
     with the specified OAI arguments. Harvested records are displayed
     on the standard output or saved to a file, but are not integrated
     into the repository. This mode is useful to 'play' with OAI
     repositories or to build special harvesting scripts.
   - Automatic harvesting relies on the settings defined in the OAI
     Harvest admin interface to periodically retrieve the repositories
     and sets to harvest. It also take care of harvesting only new or
     modified records. Records harvested using this mode are converted
     and integrated into the repository, according to the settings
     defined in the OAI Harvest admin interface.

Examples:
Manual (single-shot) harvesting mode:
   Save to /tmp/z.xml records from CDS added/modified between 2004-04-01
   and 2004-04-02, in MARCXML:
     $ oaiharvest -vListRecords -f2004-04-01 -u2004-04-02 -pmarcxml -o/tmp/z.xml http://cds.cern.ch/oai2d
Automatic (periodical) harvesting mode:
   Schedule daily harvesting of all repositories defined in OAIHarvest admin:
     $ oaiharvest -s 24h
   Schedule daily harvesting of repository 'arxiv', defined in OAIHarvest admin:
     $ oaiharvest -r arxiv -s 24h
   Harvest in 10 minutes from 'pubmed' repository records added/modified
   between 2005-05-05 and 2005-05-10:
     $ oaiharvest -r pubmed -d 2005-05-05:2005-05-10 -t 10m
""",
        help_specific_usage='Manual single-shot harvesting mode:\n'
        ' -o, --output specify output file\n'
        ' -v, --verb OAI verb to be executed\n'
        ' -m, --method http method (default POST)\n'
        ' -p, --metadataPrefix metadata format\n'
        ' -i, --identifier OAI identifier\n'
        ' -s, --set OAI set(s). Whitespace-separated list\n'
        ' -r, --resumptionToken Resume previous harvest\n'
        ' -f, --from from date (datestamp)\n'
        ' -u, --until until date (datestamp)\n'
        ' -c, --certificate path to public certificate (in case of certificate-based harvesting)\n'
        ' -k, --key path to private key (in case of certificate-based harvesting)\n'
        ' -l, --user username (in case of password-protected harvesting)\n'
        ' -w, --password password (in case of password-protected harvesting)\n'
        'Deamon mode (periodical or one-shot harvesting mode):\n'
        ' -r, --repository="repo A"[,"repo B"] \t which repositories to harvest (default=all)\n'
        ' -d, --dates=yyyy-mm-dd:yyyy-mm-dd \t reharvest given dates only\n'
        ' -i, --identifier OAI identifier if wished to run in as a task.\n'
        ' --notify-email-to Receive notifications on given email on successful upload and/or finished harvest.\n'
        ' --workflow specify the workflow to execute.\n'
        ' --create-ticket-in Provide desired ticketing queue to create a ticket in it on upload and/or finished harvest.\n'
        ' Requires a configured ticketing system (BibCatalog).\n',
        specific_params=("r:i:d:W", [
            "repository=", "identifier=", "dates=", "workflow=",
            "notify-email-to=", "create-ticket-in="
        ]),
        task_submit_elaborate_specific_parameter_fnc=
        task_submit_elaborate_specific_parameter,
        task_run_fnc=task_run_core)