Example No. 1
def test_can_load_a_simple_void_file():
    m = util.load_file_into_model('tests/data/simple-void.ttl', 'turtle')
    p = void.parse_void_model(m)

    assert p == {
        'http://lod.dataone.org/test': {
            'dumps': ['http://lod.dataone.org/test.ttl'],
            'features': ['http://lod.dataone.org/fulldump'],
        }
    }
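The fixture itself is not reproduced on this page. A minimal Turtle document that would satisfy the assertion above might look like the following, held here as a Python string; this is a hypothetical reconstruction and the actual contents of tests/data/simple-void.ttl may differ.

# Hypothetical reconstruction of tests/data/simple-void.ttl, consistent with
# the assertion in the test above; the real fixture may contain more triples.
SIMPLE_VOID_TTL = """
@prefix void: <http://rdfs.org/ns/void#> .

<http://lod.dataone.org/test> a void:Dataset ;
    void:dataDump <http://lod.dataone.org/test.ttl> ;
    void:feature <http://lod.dataone.org/fulldump> .
"""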
Example No. 2
def main():
    args = sys.argv

    if len(args) < 2:
        print "Wrong number of arguments. Try `python parse_void.py http://someurl.com`"
        return

    # Decide format to use (default or provided)
    void_format = "rdfxml"

    if len(args) == 3 and len(args[2]) > 0:
        void_format = args[2]

    filepath = args[1]
    print "Getting VoID file at %s." % filepath

    # If web...
    if filepath.startswith("http"):
        r = requests.get(args[1])

        if r.status_code != 200:
            print "Status code was not 200. Was %d instead. Exiting." % r.status_code
            return

        void_text = r.text
    else:
        if not os.path.isfile(filepath):
            print "Couldn't find file locally and didn't think file was a web URL (%s). Exiting." % filepath
            return

        void_text = None

        with open(filepath, 'rb') as f:
            void_text = f.read()

        if void_text is None:
            print "Failed to read in void file from %s. Exiting." % filepath
            return

    model = util.load_string_into_model(void_text, void_format)
    parsed = void.parse_void_model(model)
    print parsed
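Invoked from the command line, the script above follows the usage hinted at in its error message, e.g. `python parse_void.py http://someurl.com turtle`. The same work can be done without the CLI wrapper by calling the helpers directly; a minimal sketch, assuming the `util` and `void` modules are importable under those names as used throughout these examples:

# Minimal sketch: fetch and parse a VoID document without the CLI wrapper.
# Assumes `util` and `void` expose the helpers used in the examples above
# and that the target URL serves Turtle.
import requests

import util
import void

response = requests.get('http://lod.dataone.org/test.ttl')  # URL taken from Example No. 1
response.raise_for_status()

model = util.load_string_into_model(response.text, 'turtle')
print void.parse_void_model(model)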
Example No. 3
def update():
    """Update the contents of the Harvester.

    Reads the registry file and does work depending on what it finds.
    """

    JOB_NAME = "UPDATE"
    logging.info("[%s] Job started.", JOB_NAME)

    # Set up repository connection
    store = Store(SESAME_HOST, SESAME_PORT)
    repository = Repository(store, SESAME_REPOSITORY)

    # Establish the location of the registry file
    # The else case will run when not running in a container
    if os.path.isfile('/glharvest/registry.yml'):
        registry_filepath = '/glharvest/registry.yml'
    else:
        registry_filepath = 'registry.yml'

    logging.info("[%s] Loading registry file from location '%s'.", JOB_NAME, registry_filepath)

    if not os.path.isfile(registry_filepath):
        raise Exception("Couldn't locate the registry file at the provided path: %s. Exiting." % registry_filepath)

    registry_file = registry.parse_registry_file(registry_filepath)
    logging.info("[%s] Registry file parsed. %d provider(s) found.", JOB_NAME, len(registry_file))

    for provider in registry_file:
        logging.info("[%s] Processing provider '%s'.", JOB_NAME, provider)

        # Check for VoID
        if 'void' not in registry_file[provider]:
            logging.error("[%s] Location of VoID dataset not found for this provider in the registry. Skipping.", JOB_NAME)
            continue

        voidfile = registry_file[provider]['void']

        # Check for modified value
        if 'modified' not in registry_file[provider]:
            logging.info("[%s] Modified date(time) was not found in the registry for provider '%s'. This will force harvesting for this provider.", JOB_NAME, provider)
            registry_modified = None
        else:
            registry_modified = registry_file[provider]['modified']
            registry_modified = parse(str(registry_modified), ignoretz=True)

        logging.info("[%s] Provider '%s' was last modified at '%s'.", JOB_NAME, provider, registry_modified)

        # Get and parse the VoID file
        logging.info("[%s] Attemting to retrieve VoID file from location %s.", JOB_NAME, voidfile)

        try:
            r = requests.get(voidfile)
        except Exception:
            logging.error("[%s] Failed to get voidfile located at `%s`. Skipping.", JOB_NAME, voidfile)
            continue

        void_string_format = "rdfxml"

        # Use another format if we detect a different one
        if voidfile.endswith('ttl'):
            void_string_format = 'turtle'

        model = util.load_string_into_model(r.text, fmt=void_string_format)
        void_model = void.parse_void_model(model)

        for provider_dataset in void_model:
            logging.info("[%s] Processing provider VoID:Dataset '%s'.", JOB_NAME, provider_dataset)

            if 'modified' not in void_model[provider_dataset]:
                logging.error("[%s] VoID:Dataset found in VoID dump did not have dcterms:modified value. Skipping.", JOB_NAME)
                continue

            if 'features' not in void_model[provider_dataset]:
                logging.error("[%s] VoID:Dataset has no feature declaration. Skipping.", JOB_NAME)
                continue

            modified = void_model[provider_dataset]['modified']

            try:
                modified = parse(modified, ignoretz=True)
            except Exception:
                logging.error("[%s] Failed to parse modified time string of %s. Skipping.", JOB_NAME, modified)
                continue

            # TODO process features
            features = void_model[provider_dataset]['features']
            logging.info("[%s] Found features: %s", JOB_NAME, features)


            if registry_modified is not None and modified <= registry_modified:
                logging.info("[%s] Provider '%s' has not been updated since last update. Continuing on to next provider in registry.", JOB_NAME, provider)
                continue

            # Just delete all triples in the context
            # logging.info("[%s] Deleting triples in context %s.", JOB_NAME, provider)
            # repository.delete_triples_about('?s', context=provider)

            data_dumps = void_model[provider_dataset]['dumps']

            for dump in data_dumps:
                logging.info("[%s] Processing provider '%s' dataset '%s' dump '%s'.", JOB_NAME, provider, provider_dataset, dump)

                # Create a temporary, timestamp-based name for the file
                outfilename = datetime.datetime.now().strftime("%Y%m%d%H%M%S-%f")

                try:
                    urllib.urlretrieve(dump, outfilename)
                except Exception:
                    logging.error("[%s] Failed to fetch the void file at '%s'. Skipping dump file.", JOB_NAME, dump)
                    continue

                # Decide the format (from looking at the URL string)
                dump_file_format = "rdfxml"

                # Use another format if we detect a different one
                if dump.endswith('ttl'):
                    dump_file_format = 'turtle'

                # parser = RDF.Parser(name=dump_file_format)

                # Delete triples about each subject (streaming)
                # for statement in parser.parse_as_stream('file:' + outfilename):
                #     # Don't delete statements about non_URI subjects because
                #     # we can't
                #     if not statement.subject.is_resource():
                #         continue
                #
                #     print "Deleting triples in context %s about %s." % (provider, str(statement.subject))
                #     r.delete_triples_about(statement.subject, context=provider)

                # Import the file
                logging.info("[%s] Importing temp file '%s' into named graph '%s'.", JOB_NAME, outfilename, provider)
                repository.import_from_file(outfilename, context=provider, fmt=dump_file_format)

                # Delete the temp file
                try:
                    os.remove(outfilename)
                except Exception:
                    logging.exception("[%s] Failed to delete temporary file located at '%s'.", JOB_NAME, outfilename)

                # Update registry file on disk
                registry_file[provider]['modified'] = modified
                registry.save_registry(registry_filepath, registry_file)
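The `update()` job above indexes `registry_file` by provider name and reads each entry's 'void' and 'modified' keys. A minimal sketch of the structure that `registry.parse_registry_file()` is therefore assumed to return follows; the provider name, URL, and timestamp are illustrative only.

# Illustrative only: the shape update() expects back from
# registry.parse_registry_file(). All values here are made up.
registry_file = {
    'example-provider': {
        'void': 'http://lod.example.org/void.ttl',    # location of the provider's VoID document
        'modified': '2015-01-01T00:00:00',            # last harvested state, parsed with dateutil
    },
}

for provider in registry_file:
    voidfile = registry_file[provider]['void']                    # required; provider is skipped otherwise
    registry_modified = registry_file[provider].get('modified')  # optional; None forces a harvest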