Exemplo n.º 1
0
def oai_harvest_get(
    prefix,
    baseurl,
    harvestpath,
    fro=None,
    until=None,
    setspecs=None,
    user=None,
    password=None,
    cert_file=None,
    key_file=None,
    method="POST",
):
    """
    Retrieve OAI records from given repository, with given arguments
    """
    try:
        (addressing_scheme, network_location, path, parameters, query, fragment_identifier) = urlparse.urlparse(baseurl)
        secure = addressing_scheme == "https"

        http_param_dict = {"verb": "ListRecords", "metadataPrefix": prefix}
        if fro:
            http_param_dict["from"] = fro
        if until:
            http_param_dict["until"] = until
        sets = None
        if setspecs:
            sets = [set.strip() for set in setspecs.split(" ")]

        print "Start harvesting"
        oai_harvest_getter.harvest(
            network_location,
            path,
            http_param_dict,
            method,
            harvestpath,
            sets,
            secure,
            user,
            password,
            cert_file,
            key_file,
        )

        harvest_dir, harvest_filename = os.path.split(harvestpath)
        files = os.listdir(harvest_dir)
        files.sort()
        harvested_files = [
            harvest_dir + os.sep + filename for filename in files if filename.startswith(harvest_filename)
        ]

        return (1, harvested_files)
    except StandardError, e:
        print e
        return (0, e)
def oai_harvest_get(prefix,
                    baseurl,
                    harvestpath,
                    fro=None,
                    until=None,
                    setspecs=None,
                    user=None,
                    password=None,
                    cert_file=None,
                    key_file=None,
                    method="POST"):
    """
    Retrieve OAI records from given repository, with given arguments
    """
    try:
        (addressing_scheme, network_location, path, parameters, \
         query, fragment_identifier) = urlparse.urlparse(baseurl)
        secure = (addressing_scheme == "https")

        http_param_dict = {'verb': "ListRecords", 'metadataPrefix': prefix}
        if fro:
            http_param_dict['from'] = fro
        if until:
            http_param_dict['until'] = until
        sets = None
        if setspecs:
            sets = [set.strip() for set in setspecs.split(' ')]

        print "Start harvesting"
        oai_harvest_getter.harvest(network_location, path, http_param_dict,
                                   method, harvestpath, sets, secure, user,
                                   password, cert_file, key_file)

        harvest_dir, harvest_filename = os.path.split(harvestpath)
        files = os.listdir(harvest_dir)
        files.sort()
        harvested_files = [harvest_dir + os.sep + filename for \
                           filename in files \
                           if filename.startswith(harvest_filename)]

        return (1, harvested_files)
    except StandardError, e:
        print e
        return (0, e)
Exemplo n.º 3
0
def main():
    """Starts the tool.

    If the command line arguments are those of the 'manual' mode, then
    starts a manual one-time harvesting. Else trigger a BibSched task
    for automated harvesting based on the OAIHarvest admin settings.
    """

    # Let's try to parse the arguments as used in manual harvesting:
    try:
        opts, args = getopt.getopt(sys.argv[1:], "o:v:m:p:i:s:f:u:r:x:c:k:w:l:",
                                   ["output=",
                                    "verb=",
                                    "method=",
                                    "metadataPrefix=",
                                    "identifier=",
                                    "set=",
                                    "from=",
                                    "until=",
                                    "resumptionToken=",
                                    "certificate=",
                                    "key=",
                                    "user="******"password="******"POST"
            output                 = ""
            user                   = None
            password               = None
            cert_file              = None
            key_file               = None
            sets = []

            # get options and arguments
            for opt, opt_value in opts:
                if   opt in ["-v", "--verb"]:
                    http_param_dict['verb']             = opt_value
                elif opt in ["-m", '--method']:
                    if opt_value == "GET" or opt_value == "POST":
                        method                          = opt_value
                elif opt in ["-p", "--metadataPrefix"]:
                    http_param_dict['metadataPrefix']   = opt_value
                elif opt in ["-i", "--identifier"]:
                    http_param_dict['identifier']       = opt_value
                elif opt in ["-s", "--set"]:
                    sets                                = opt_value.split()
                elif opt in ["-f", "--from"]:
                    http_param_dict['from']             = opt_value
                elif opt in ["-u", "--until"]:
                    http_param_dict['until']            = opt_value
                elif opt in ["-r", "--resumptionToken"]:
                    http_param_dict['resumptionToken']  = opt_value
                elif opt in ["-o", "--output"]:
                    output                              = opt_value
                elif opt in ["-c", "--certificate"]:
                    cert_file                           = opt_value
                elif opt in ["-k", "--key"]:
                    key_file                            = opt_value
                elif opt in ["-l", "--user"]:
                    user                                = opt_value
                elif opt in ["-w", "--password"]:
                    password                            = opt_value
                elif opt in ["-V", "--version"]:
                    print __revision__
                    sys.exit(0)
                else:
                    usage(1, "Option %s is not allowed" % opt)

            if len(args) > 0:
                base_url = args[-1]
                if not base_url.lower().startswith('http'):
                    base_url = 'http://' + base_url
                (addressing_scheme, network_location, path, parameters, \
                 query, fragment_identifier) = urlparse.urlparse(base_url)
                secure = (addressing_scheme == "https")

                if (cert_file and not key_file) or \
                   (key_file and not cert_file):
                    # Both are needed if one specified
                    usage(1, "You must specify both certificate and key files")

                if password and not user:
                    # User must be specified when password is given
                    usage(1, "You must specify a username")
                elif user and not password:
                    if not secure:
                        sys.stderr.write("*WARNING* Your password will be sent in clear!\n")
                    try:
                        password = getpass.getpass()
                    except KeyboardInterrupt, e:
                        sys.stderr.write("\n")
                        sys.exit(0)

                oai_harvest_getter.harvest(network_location, path,
                                           http_param_dict, method,
                                           output, sets, secure, user,
                                           password, cert_file,
                                           key_file)

                sys.stderr.write("Harvesting completed at: %s\n\n" %
                    time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
                return
            else:
                usage(1, "You must specify the URL to harvest")
        else:
def main():
    """Starts the tool.

    If the command line arguments are those of the 'manual' mode, then
    starts a manual one-time harvesting. Else trigger a BibSched task
    for automated harvesting based on the OAIHarvest admin settings.
    """

    # Let's try to parse the arguments as used in manual harvesting:
    try:
        opts, args = getopt.getopt(
            sys.argv[1:], "o:v:m:p:i:s:f:u:r:x:c:k:w:l:", [
                "output=", "verb=", "method=", "metadataPrefix=",
                "identifier=", "set=", "from=", "until=", "resumptionToken=",
                "certificate=", "key=", "user="******"password="******"POST"
            output = ""
            user = None
            password = None
            cert_file = None
            key_file = None
            sets = []

            # get options and arguments
            for opt, opt_value in opts:
                if opt in ["-v", "--verb"]:
                    http_param_dict['verb'] = opt_value
                elif opt in ["-m", '--method']:
                    if opt_value == "GET" or opt_value == "POST":
                        method = opt_value
                elif opt in ["-p", "--metadataPrefix"]:
                    http_param_dict['metadataPrefix'] = opt_value
                elif opt in ["-i", "--identifier"]:
                    http_param_dict['identifier'] = opt_value
                elif opt in ["-s", "--set"]:
                    sets = opt_value.split()
                elif opt in ["-f", "--from"]:
                    http_param_dict['from'] = opt_value
                elif opt in ["-u", "--until"]:
                    http_param_dict['until'] = opt_value
                elif opt in ["-r", "--resumptionToken"]:
                    http_param_dict['resumptionToken'] = opt_value
                elif opt in ["-o", "--output"]:
                    output = opt_value
                elif opt in ["-c", "--certificate"]:
                    cert_file = opt_value
                elif opt in ["-k", "--key"]:
                    key_file = opt_value
                elif opt in ["-l", "--user"]:
                    user = opt_value
                elif opt in ["-w", "--password"]:
                    password = opt_value
                elif opt in ["-V", "--version"]:
                    print __revision__
                    sys.exit(0)
                else:
                    usage(1, "Option %s is not allowed" % opt)

            if len(args) > 0:
                base_url = args[-1]
                if not base_url.lower().startswith('http'):
                    base_url = 'http://' + base_url
                (addressing_scheme, network_location, path, parameters, \
                 query, fragment_identifier) = urlparse.urlparse(base_url)
                secure = (addressing_scheme == "https")

                if (cert_file and not key_file) or \
                   (key_file and not cert_file):
                    # Both are needed if one specified
                    usage(1, "You must specify both certificate and key files")

                if password and not user:
                    # User must be specified when password is given
                    usage(1, "You must specify a username")
                elif user and not password:
                    if not secure:
                        sys.stderr.write(
                            "*WARNING* Your password will be sent in clear!\n")
                    try:
                        password = getpass.getpass()
                    except KeyboardInterrupt, e:
                        sys.stderr.write("\n")
                        sys.exit(0)

                oai_harvest_getter.harvest(network_location, path,
                                           http_param_dict, method, output,
                                           sets, secure, user, password,
                                           cert_file, key_file)

                sys.stderr.write(
                    "Harvesting completed at: %s\n\n" %
                    time.strftime("%Y-%m-%d %H:%M:%S --> ", time.localtime()))
                return
            else:
                usage(1, "You must specify the URL to harvest")
        else: