def test_cant_load_a_bogus_string_into_a_model():
    """Text that is not Turtle must load as None instead of a model."""
    not_turtle = " bogus ttl content "

    # Ask for the turtle parser explicitly and check the result inline.
    assert util.load_string_into_model(not_turtle, fmt="turtle") is None
def main():
    """Fetch a VoID file, parse it and print the parsed result.

    Command line usage: ``python parse_void.py <url-or-path> [format]``.

    The first argument is treated as a web URL when it starts with
    ``http`` and as a local file path otherwise.  The optional second
    argument is the format name handed to ``util.load_string_into_model``;
    it defaults to ``"rdfxml"``.

    Every failure (missing argument, non-200 response, missing local file)
    prints a message and returns without attempting to parse anything.
    """
    # NOTE: the single-argument ``print(...)`` form is used throughout because
    # it produces the same output under the Python 2 print statement and the
    # Python 3 print function.
    args = sys.argv

    if len(args) < 2:
        print("Wrong number of arguments. Try `python parse_void.py http://someurl.com`")
        return

    # Decide format to use (default or provided)
    void_format = "rdfxml"
    if len(args) == 3 and len(args[2]) > 0:
        void_format = args[2]

    filepath = args[1]
    print("Getting VoID file at %s." % filepath)

    void_text = None

    if filepath.startswith("http"):
        # Web location: only a 200 response is accepted.
        r = requests.get(filepath)

        if r.status_code != 200:
            print("Status code was not 200. Was %d instead. Exiting." % r.status_code)
            return

        void_text = r.text
    else:
        # Local file: stop here instead of letting open() fail on a bad path.
        if not os.path.isfile(filepath):
            print("Couldn't find file locally and didn't think file was a web URL (%s). Exiting." % filepath)
            return

        with open(filepath, 'rb') as f:
            void_text = f.read()

    if void_text is None:
        print("Failed to read in void file from %s. Exiting." % filepath)
        return

    model = util.load_string_into_model(void_text, void_format)
    parsed = void.parse_void_model(model)

    print(parsed)
def test_can_load_a_string_into_a_model():
    """A well-formed Turtle document must load into a non-None model."""
    # Turtle comments run from '#' to the end of the line, so the document has
    # to be spread over several lines: on a single line the comment below
    # would swallow every statement that follows it.
    turtle = """
    @base <http://example.org/> .
    @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    @prefix foaf: <http://xmlns.com/foaf/0.1/> .
    @prefix rel: <http://www.perceive.net/schemas/relationship/> .

    <#green-goblin>
        rel:enemyOf <#spiderman> ;
        a foaf:Person ;    # in the context of the Marvel universe
        foaf:name "Green Goblin" .

    <#spiderman>
        rel:enemyOf <#green-goblin> ;
        a foaf:Person ;
        foaf:name "Spiderman" .
    """

    parsed = util.load_string_into_model(turtle, fmt="turtle")

    assert parsed is not None
def update():
    """Update the contents of the Harvester.

    Reads the registry file and does work depending on what it finds.

    For every provider in the registry the provider's VoID file is fetched
    and parsed.  For every VoID:Dataset in it whose modified date is later
    than the date recorded in the registry (or when the registry has no
    date for the provider), each data dump is downloaded to a temporary
    file and imported into the repository with the provider name as the
    context.  After a dataset's dumps have been processed, the dataset's
    modified date is written to the registry and the registry is saved.

    Providers, datasets and dumps that cannot be processed are logged and
    skipped so that one bad entry does not stop the rest of the job.

    Raises:
        Exception: if no registry file exists at the selected path.
    """
    JOB_NAME = "UPDATE"
    logging.info("[%s] Job started.", JOB_NAME)

    # Set up repository connection
    store = Store(SESAME_HOST, SESAME_PORT)
    repository = Repository(store, SESAME_REPOSITORY)

    # Establish the location of the registry file
    # The else case will run when not running in a container
    if os.path.isfile('/glharvest/registry.yml'):
        registry_filepath = '/glharvest/registry.yml'
    else:
        registry_filepath = 'registry.yml'

    logging.info("[%s] Loading registry file from location '%s'.", JOB_NAME, registry_filepath)

    # os.path.isfile must be *called*; the bare function object is always
    # truthy, which would make this check a no-op.
    if not os.path.isfile(registry_filepath):
        raise Exception("Couldn't locate the registry file at the provided path: %s. Exiting." % registry_filepath)

    registry_file = registry.parse_registry_file(registry_filepath)

    logging.info("[%s] Registry file parsed. %d provider(s) found.", JOB_NAME, len(registry_file))

    for provider in registry_file:
        logging.info("[%s] Processing provider '%s'.", JOB_NAME, provider)
        provider_entry = registry_file[provider]

        # Check for VoID
        if 'void' not in provider_entry:
            logging.error("[%s] Location of VoID dataset not found for this provider in the registry. Skipping.", JOB_NAME)
            continue

        voidfile = provider_entry['void']

        # Check for modified value
        if 'modified' not in provider_entry:
            logging.info("[%s] Modified date(time) was not found in the registry for provider '%s'. This will force harvesting for this provider.", JOB_NAME, provider)
            registry_modified = None
        else:
            registry_modified = parse(str(provider_entry['modified']), ignoretz=True)
            logging.info("[%s] Provider '%s' was last modified at '%s'.", JOB_NAME, provider, registry_modified)

        # Get and parse the VoID file
        logging.info("[%s] Attemting to retrieve VoID file from location %s.", JOB_NAME, voidfile)

        try:
            r = requests.get(voidfile)
        except Exception:
            logging.exception("[%s] Failed to get voidfile located at `%s`. Skipping.", JOB_NAME, voidfile)
            continue

        # Don't try to parse an error page as RDF.
        if r.status_code != 200:
            logging.error("[%s] Request for voidfile located at `%s` returned status code %d. Skipping.", JOB_NAME, voidfile, r.status_code)
            continue

        model = util.load_string_into_model(r.text, fmt=_guess_rdf_format(voidfile))

        # The loader hands back None for content it cannot parse.
        if model is None:
            logging.error("[%s] Failed to parse voidfile located at `%s`. Skipping.", JOB_NAME, voidfile)
            continue

        void_model = void.parse_void_model(model)

        for provider_dataset in void_model:
            logging.info("[%s] Processing provider VoID:Dataset '%s'.", JOB_NAME, provider_dataset)
            dataset = void_model[provider_dataset]

            if 'modified' not in dataset:
                logging.error("[%s] VoID:Dataset found in VoID dump did not have dcterms:modified value. Skipping.", JOB_NAME)
                continue

            if 'features' not in dataset:
                logging.error("[%s] VoID:Dataset has no feature declaration. Skipping.", JOB_NAME)
                continue

            modified = dataset['modified']

            try:
                modified = parse(modified, ignoretz=True)
            except (TypeError, ValueError, OverflowError):
                logging.error("[%s] Failed to parse modified time string of %s. Skipping.", JOB_NAME, modified)
                continue

            # TODO process features
            features = dataset['features']
            logging.info("[%s] Found features: %s", JOB_NAME, features)

            # Nothing new since the date recorded in the registry.  Note that
            # `continue` moves on to the next dataset of this provider.
            if registry_modified is not None and modified <= registry_modified:
                logging.info("[%s] Provider '%s' has not been updated since last update. Continuing on to next provider in registry.", JOB_NAME, provider)
                continue

            # A dataset without dumps has nothing to import; skip it instead
            # of aborting the whole job with a KeyError.
            if 'dumps' not in dataset:
                logging.error("[%s] VoID:Dataset '%s' has no data dumps. Skipping.", JOB_NAME, provider_dataset)
                continue

            # NOTE: triples already stored in the provider's context are not
            # deleted before importing; the code that did so (whole-context
            # and per-subject deletion) was disabled.
            for dump in dataset['dumps']:
                logging.info("[%s] Processing provider '%s' dataset '%s' dump '%s'.", JOB_NAME, provider, provider_dataset, dump)

                # Create a temporary name for the file
                # NOTE(review): "%s" is not a standard strftime directive;
                # presumably this relies on the platform expanding it to
                # epoch seconds -- confirm on the deployment platform.
                outfilename = datetime.datetime.now().strftime("%s-%f")

                try:
                    urllib.urlretrieve(dump, outfilename)
                except Exception:
                    logging.exception("[%s] Failed to fetch the dump file at '%s'. Skipping dump file.", JOB_NAME, dump)
                    continue

                # Decide the format (from looking at the URL string)
                dump_file_format = _guess_rdf_format(dump)

                try:
                    # Import the file
                    logging.info("[%s] Importing temp file '%s' into named graph '%s'.", JOB_NAME, outfilename, provider)
                    repository.import_from_file(outfilename, context=provider, fmt=dump_file_format)
                finally:
                    # Delete the temp file, even when the import raised
                    try:
                        os.remove(outfilename)
                    except OSError:
                        logging.exception("[%s] Failed to delete temporary file located at '%s'.", JOB_NAME, outfilename)

            # Update registry file on disk
            registry_file[provider]['modified'] = modified
            registry.save_registry(registry_filepath, registry_file)


def _guess_rdf_format(location):
    """Return the RDF format name implied by the ending of `location`."""
    # Only a trailing 'ttl' is recognised; anything else is read as RDF/XML.
    if location.endswith('ttl'):
        return 'turtle'

    return 'rdfxml'