def oai_harvest_get( prefix, baseurl, harvestpath, fro=None, until=None, setspecs=None, user=None, password=None, cert_file=None, key_file=None, method="POST", ): """ Retrieve OAI records from given repository, with given arguments """ try: (addressing_scheme, network_location, path, parameters, query, fragment_identifier) = urlparse.urlparse(baseurl) secure = addressing_scheme == "https" http_param_dict = {"verb": "ListRecords", "metadataPrefix": prefix} if fro: http_param_dict["from"] = fro if until: http_param_dict["until"] = until sets = None if setspecs: sets = [set.strip() for set in setspecs.split(" ")] print "Start harvesting" oai_harvest_getter.harvest( network_location, path, http_param_dict, method, harvestpath, sets, secure, user, password, cert_file, key_file, ) harvest_dir, harvest_filename = os.path.split(harvestpath) files = os.listdir(harvest_dir) files.sort() harvested_files = [ harvest_dir + os.sep + filename for filename in files if filename.startswith(harvest_filename) ] return (1, harvested_files) except StandardError, e: print e return (0, e)
def oai_harvest_get(prefix, baseurl, harvestpath, fro=None, until=None, setspecs=None, user=None, password=None, cert_file=None, key_file=None, method="POST"): """ Retrieve OAI records from given repository, with given arguments """ try: (addressing_scheme, network_location, path, parameters, \ query, fragment_identifier) = urlparse.urlparse(baseurl) secure = (addressing_scheme == "https") http_param_dict = {'verb': "ListRecords", 'metadataPrefix': prefix} if fro: http_param_dict['from'] = fro if until: http_param_dict['until'] = until sets = None if setspecs: sets = [set.strip() for set in setspecs.split(' ')] print "Start harvesting" oai_harvest_getter.harvest(network_location, path, http_param_dict, method, harvestpath, sets, secure, user, password, cert_file, key_file) harvest_dir, harvest_filename = os.path.split(harvestpath) files = os.listdir(harvest_dir) files.sort() harvested_files = [harvest_dir + os.sep + filename for \ filename in files \ if filename.startswith(harvest_filename)] return (1, harvested_files) except StandardError, e: print e return (0, e)
def main(): """Starts the tool. If the command line arguments are those of the 'manual' mode, then starts a manual one-time harvesting. Else trigger a BibSched task for automated harvesting based on the OAIHarvest admin settings. """ # Let's try to parse the arguments as used in manual harvesting: try: opts, args = getopt.getopt(sys.argv[1:], "o:v:m:p:i:s:f:u:r:x:c:k:w:l:", ["output=", "verb=", "method=", "metadataPrefix=", "identifier=", "set=", "from=", "until=", "resumptionToken=", "certificate=", "key=", "user="******"password="******"POST" output = "" user = None password = None cert_file = None key_file = None sets = [] # get options and arguments for opt, opt_value in opts: if opt in ["-v", "--verb"]: http_param_dict['verb'] = opt_value elif opt in ["-m", '--method']: if opt_value == "GET" or opt_value == "POST": method = opt_value elif opt in ["-p", "--metadataPrefix"]: http_param_dict['metadataPrefix'] = opt_value elif opt in ["-i", "--identifier"]: http_param_dict['identifier'] = opt_value elif opt in ["-s", "--set"]: sets = opt_value.split() elif opt in ["-f", "--from"]: http_param_dict['from'] = opt_value elif opt in ["-u", "--until"]: http_param_dict['until'] = opt_value elif opt in ["-r", "--resumptionToken"]: http_param_dict['resumptionToken'] = opt_value elif opt in ["-o", "--output"]: output = opt_value elif opt in ["-c", "--certificate"]: cert_file = opt_value elif opt in ["-k", "--key"]: key_file = opt_value elif opt in ["-l", "--user"]: user = opt_value elif opt in ["-w", "--password"]: password = opt_value elif opt in ["-V", "--version"]: print __revision__ sys.exit(0) else: usage(1, "Option %s is not allowed" % opt) if len(args) > 0: base_url = args[-1] if not base_url.lower().startswith('http'): base_url = 'http://' + base_url (addressing_scheme, network_location, path, parameters, \ query, fragment_identifier) = urlparse.urlparse(base_url) secure = (addressing_scheme == "https") if (cert_file and not key_file) or \ (key_file and not cert_file): # Both are needed if one specified usage(1, "You must specify both certificate and key files") if password and not user: # User must be specified when password is given usage(1, "You must specify a username") elif user and not password: if not secure: sys.stderr.write("*WARNING* Your password will be sent in clear!\n") try: password = getpass.getpass() except KeyboardInterrupt, e: sys.stderr.write("\n") sys.exit(0) oai_harvest_getter.harvest(network_location, path, http_param_dict, method, output, sets, secure, user, password, cert_file, key_file) sys.stderr.write("Harvesting completed at: %s\n\n" % time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) return else: usage(1, "You must specify the URL to harvest") else:
def main(): """Starts the tool. If the command line arguments are those of the 'manual' mode, then starts a manual one-time harvesting. Else trigger a BibSched task for automated harvesting based on the OAIHarvest admin settings. """ # Let's try to parse the arguments as used in manual harvesting: try: opts, args = getopt.getopt( sys.argv[1:], "o:v:m:p:i:s:f:u:r:x:c:k:w:l:", [ "output=", "verb=", "method=", "metadataPrefix=", "identifier=", "set=", "from=", "until=", "resumptionToken=", "certificate=", "key=", "user="******"password="******"POST" output = "" user = None password = None cert_file = None key_file = None sets = [] # get options and arguments for opt, opt_value in opts: if opt in ["-v", "--verb"]: http_param_dict['verb'] = opt_value elif opt in ["-m", '--method']: if opt_value == "GET" or opt_value == "POST": method = opt_value elif opt in ["-p", "--metadataPrefix"]: http_param_dict['metadataPrefix'] = opt_value elif opt in ["-i", "--identifier"]: http_param_dict['identifier'] = opt_value elif opt in ["-s", "--set"]: sets = opt_value.split() elif opt in ["-f", "--from"]: http_param_dict['from'] = opt_value elif opt in ["-u", "--until"]: http_param_dict['until'] = opt_value elif opt in ["-r", "--resumptionToken"]: http_param_dict['resumptionToken'] = opt_value elif opt in ["-o", "--output"]: output = opt_value elif opt in ["-c", "--certificate"]: cert_file = opt_value elif opt in ["-k", "--key"]: key_file = opt_value elif opt in ["-l", "--user"]: user = opt_value elif opt in ["-w", "--password"]: password = opt_value elif opt in ["-V", "--version"]: print __revision__ sys.exit(0) else: usage(1, "Option %s is not allowed" % opt) if len(args) > 0: base_url = args[-1] if not base_url.lower().startswith('http'): base_url = 'http://' + base_url (addressing_scheme, network_location, path, parameters, \ query, fragment_identifier) = urlparse.urlparse(base_url) secure = (addressing_scheme == "https") if (cert_file and not key_file) or \ (key_file and not cert_file): # Both are needed if one specified usage(1, "You must specify both certificate and key files") if password and not user: # User must be specified when password is given usage(1, "You must specify a username") elif user and not password: if not secure: sys.stderr.write( "*WARNING* Your password will be sent in clear!\n") try: password = getpass.getpass() except KeyboardInterrupt, e: sys.stderr.write("\n") sys.exit(0) oai_harvest_getter.harvest(network_location, path, http_param_dict, method, output, sets, secure, user, password, cert_file, key_file) sys.stderr.write( "Harvesting completed at: %s\n\n" % time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime())) return else: usage(1, "You must specify the URL to harvest") else: