def __init__(self, xml):
    '''Initializes the codelist class.

    Parameters
    @xml: The XML file of the codelist.'''

    read_attribute = AttributeHelper.attribute_key

    self.xml = xml

    # Both values are read from attributes of the codelist XML itself
    self.id = read_attribute(self.xml, 'name')
    self.last_updated = read_attribute(self.xml, 'date-last-modified')
def get_organisation_defaults(self):
    '''Returns the defaults of the organisation.

    Returns
    @defaults: A dictionary containing the id, language and currency
               defaults of the organisation.'''

    read_attribute = AttributeHelper.attribute_key

    defaults = {'id': self.id}
    defaults['language'] = read_attribute(
        self.xml, '{http://www.w3.org/XML/1998/namespace}lang')
    defaults['currency'] = read_attribute(self.xml, 'default-currency')

    return defaults
def get_activity_defaults(self):
    '''Returns the defaults of the activity.

    Returns
    @defaults: A dictionary containing the id, language, currency,
               finance type, flow type, aid type, tied status, hierarchy
               and Linked Data URI defaults of the activity.'''

    read_attribute = AttributeHelper.attribute_key

    defaults = {'id': self.id}
    defaults['language'] = read_attribute(
        self.xml, '{http://www.w3.org/XML/1998/namespace}lang')
    defaults['currency'] = read_attribute(self.xml, 'default-currency')

    # The default classifications are all looked up the same way
    default_elements = (('finance_type', 'default-finance-type'),
                        ('flow_type', 'default-flow-type'),
                        ('aid_type', 'default-aid-type'),
                        ('tied_status', 'default-tied-status'))

    for key, element in default_elements:
        defaults[key] = self.get_default_type(element)

    defaults['hierarchy'] = self.hierarchy
    defaults['linked_data_uri'] = self.linked_data_uri

    return defaults
def __init__(self, xml):
    '''Initializes the organisation class.

    Parameters
    @xml: An ElementTree of an organisation.'''

    read_attribute = AttributeHelper.attribute_key

    self.xml = xml

    # The id is determined by the class' own get_id method
    self.id = self.get_id()
    self.last_updated = read_attribute(self.xml, 'last-updated-datetime')
def get_codelist_defaults(self):
    '''Retrieves the defaults for the codelist.

    Returns
    @defaults: A dictionary containing the id and language defaults
               of the codelist.'''

    defaults = {'id': self.id}
    defaults['language'] = AttributeHelper.attribute_key(
        self.xml, '{http://www.w3.org/XML/1998/namespace}lang')

    return defaults
def __init__(self, xml, version, linked_data_default):
    '''Initializes the activity class.

    Parameters
    @xml: An ElementTree of an activity.
    @version: The version of the activities.
    @linked_data_default: The Linked Data default URI of the activity.'''

    read_attribute = AttributeHelper.attribute_key

    self.xml = xml
    self.failed = []

    # Values taken directly from the activity XML
    self.id = self.get_id()
    self.last_updated = read_attribute(self.xml, 'last-updated-datetime')
    self.hierarchy = read_attribute(self.xml, 'hierarchy')

    # Values that fall back on the defaults that were handed in
    self.version = self.determine_version(version)
    self.linked_data_uri = self.determine_linked_data_uri(
        linked_data_default, self.id)
def get_default_type(self, type):
    '''Returns a default type of the activity.

    Parameters
    @type: The element that should be retrieved.

    Returns
    @default_type: The 'code' attribute of the requested element, or
                   None if the activity has no such element.'''

    default_type = self.xml.find(type)

    # Identity test instead of '== None': the result is either an element
    # or None, and equality would go through the element's own comparison.
    if default_type is None:
        return None

    return AttributeHelper.attribute_key(default_type, 'code')
def determine_version(self, version):
    '''Determines the version of this activity.

    Parameters
    @version: The version of the iati-activities attribute.

    Returns
    @version: The version attribute of the activity itself if it has one,
              otherwise the version that was handed in.'''

    activity_version = AttributeHelper.attribute_key(self.xml, 'version')

    # The activity's own version takes precedence over the given one
    if activity_version is not None:
        return activity_version

    return version
def determine_linked_data_uri(self, linked_data_default, id):
    '''Determines the Linked Data URI of this activity.

    Parameters
    @linked_data_default: The default Linked Data URI that the ID is
                          appended to, or None if there is no default.
    @id: The ID of the activity.

    Returns
    @linked_data_uri: The Linked Data URI or None if not specified.'''

    linked_data_uri = AttributeHelper.attribute_key(self.xml, 'linked-data-uri')

    # An explicit URI on the activity wins over the default
    if linked_data_uri is not None:
        return linked_data_uri

    if linked_data_default is not None:
        return str(linked_data_default) + str(id)

    return None
def main():
    '''Converts Activity XMLs to Turtle files and stores these to local folder.

    Every '*.xml' document in the XML folder gets its own sub folder in the
    Turtle folder, holding one Turtle file per converted activity and one
    provenance Turtle file for the document. Documents that cannot be parsed
    and activities that cannot be converted are reported and skipped.
    Elements reported as failed by the converter are printed at the end.'''

    # Settings
    xml_folder = "/media/Acer/School/IATI-data/xml/activities/"
    turtle_folder = "/media/Acer/School/IATI-data/activity/"
    Iati = Namespace("http://purl.org/collections/iati/")

    if not os.path.isdir(turtle_folder):
        os.makedirs(turtle_folder)

    document_count = 1
    activity_count = 1

    failed_elements = []

    # Retrieve XML files from the XML folder
    for document in glob.glob(xml_folder + '*.xml'):

        activity_ids = []

        doc_name = str(document.rsplit('/', 1)[1])
        doc_id = doc_name[:-4]

        doc_folder = turtle_folder + doc_id + '/'

        if not os.path.isdir(doc_folder):
            os.makedirs(doc_folder)

        provenance = Graph()
        provenance.bind('iati', Iati)

        # Parse the XML file
        try:
            xml = ET.parse(document)
        except ET.ParseError:
            print("Could not parse file " + document)
            # Without a parsed document there are no activities and no
            # version to describe, so the document is skipped entirely.
            continue

        # Get the version
        root = xml.getroot()
        document_version = AttributeHelper.attribute_key(root, 'version')

        linked_data_default = AttributeHelper.attribute_key(
            root, 'linked-data-default')

        # Values for the provenance graph. They are replaced by the values
        # of each converted activity, and are defined up front so that a
        # document without any convertible activity does not fail.
        # NOTE(review): assumes ConvertProvenance accepts a last_updated
        # of None - confirm.
        version = document_version
        last_updated = None

        # Convert each activity in XML file to RDFLib Graph
        for activity in xml.findall('iati-activity'):

            try:
                # Always hand in the version of the document, not the
                # version returned for the previous activity.
                converter = IatiConverter.ConvertActivity(
                    activity, document_version, linked_data_default)
                graph, activity_id, last_updated, version, fails = \
                    converter.convert(Iati)
            except TypeError as e:
                print("Error in " + document + ":" + str(e))
                # Nothing was converted; do not fall through with the
                # results of an earlier activity.
                activity_count += 1
                continue

            if fails is not None:
                for fail in fails:
                    if fail not in failed_elements:
                        failed_elements.append(fail)

            if (graph is not None) and (activity_id is not None):
                # Slashes in the identifier would be read as folders
                file_id = str(activity_id.replace('/', '%2F'))

                print("Processing: Activity %s (# %s) in document %s (# %s)" % (
                    file_id, str(activity_count), doc_name,
                    str(document_count)))

                # Write activity to Turtle and store in local folder
                graph_turtle = graph.serialize(format='turtle')

                with open(doc_folder + file_id + '.ttl', 'w') as turtle_file:
                    turtle_file.write(graph_turtle)

                activity_ids.append(activity_id)

            else:
                print("WARNING: Activity (# %s) in %s (# %s) has no identifier specified" % (
                    str(activity_count), doc_name, str(document_count)))

            activity_count += 1

        document_count += 1

        # Add provenance from corresponding JSON file
        json_document = document[:-4] + '.json'

        try:
            with open(json_document, 'r') as open_json_doc:
                json_parsed = json.load(open_json_doc)
        except (IOError, ValueError):
            # Missing or unreadable file (IOError) or invalid JSON
            # (ValueError); the provenance is then built without it.
            print("Could not parse file " + json_document)
            json_parsed = None

        provenance_converter = IatiConverter.ConvertProvenance(
            'activity', json_parsed, provenance, doc_id, last_updated,
            version, activity_ids)
        provenance = provenance_converter.convert(Iati)

        # Write provenance graph to Turtle and store in local folder
        provenance_turtle = provenance.serialize(format='turtle')

        with open(doc_folder + 'provenance-' + doc_id + '.ttl', 'w') as turtle_file:
            turtle_file.write(provenance_turtle)

    print("Failed:")

    for fail in failed_elements:
        print(fail)

    print("Done!")
def main():
    '''Converts Activity XMLs to Turtle files and stores these to local folder.

    Every '*.xml' document in the XML folder gets its own sub folder in the
    Turtle folder, holding one Turtle file per converted activity and one
    provenance Turtle file for the document. Documents that cannot be parsed
    and activities that cannot be converted are reported and skipped.
    Elements reported as failed by the converter are printed at the end.'''

    # Settings
    xml_folder = "/media/Acer/School/IATI-data/xml/activities/"
    turtle_folder = "/media/Acer/School/IATI-data/activity/"
    Iati = Namespace("http://purl.org/collections/iati/")

    if not os.path.isdir(turtle_folder):
        os.makedirs(turtle_folder)

    document_count = 1
    activity_count = 1

    failed_elements = []

    # Retrieve XML files from the XML folder
    for document in glob.glob(xml_folder + '*.xml'):

        activity_ids = []

        doc_name = str(document.rsplit('/', 1)[1])
        doc_id = doc_name[:-4]

        doc_folder = turtle_folder + doc_id + '/'

        if not os.path.isdir(doc_folder):
            os.makedirs(doc_folder)

        provenance = Graph()
        provenance.bind('iati', Iati)

        # Parse the XML file
        try:
            xml = ET.parse(document)
        except ET.ParseError:
            print("Could not parse file " + document)
            # Without a parsed document there are no activities and no
            # version to describe, so the document is skipped entirely.
            continue

        # Get the version
        root = xml.getroot()
        document_version = AttributeHelper.attribute_key(root, 'version')

        linked_data_default = AttributeHelper.attribute_key(
            root, 'linked-data-default')

        # Values for the provenance graph. They are replaced by the values
        # of each converted activity, and are defined up front so that a
        # document without any convertible activity does not fail.
        # NOTE(review): assumes ConvertProvenance accepts a last_updated
        # of None - confirm.
        version = document_version
        last_updated = None

        # Convert each activity in XML file to RDFLib Graph
        for activity in xml.findall('iati-activity'):

            try:
                # Always hand in the version of the document, not the
                # version returned for the previous activity.
                converter = IatiConverter.ConvertActivity(
                    activity, document_version, linked_data_default)
                graph, activity_id, last_updated, version, fails = \
                    converter.convert(Iati)
            except TypeError as e:
                print("Error in " + document + ":" + str(e))
                # Nothing was converted; do not fall through with the
                # results of an earlier activity.
                activity_count += 1
                continue

            if fails is not None:
                for fail in fails:
                    if fail not in failed_elements:
                        failed_elements.append(fail)

            if (graph is not None) and (activity_id is not None):
                # Slashes in the identifier would be read as folders
                file_id = str(activity_id.replace('/', '%2F'))

                print("Processing: Activity %s (# %s) in document %s (# %s)" % (
                    file_id, str(activity_count), doc_name,
                    str(document_count)))

                # Write activity to Turtle and store in local folder
                graph_turtle = graph.serialize(format='turtle')

                with open(doc_folder + file_id + '.ttl', 'w') as turtle_file:
                    turtle_file.write(graph_turtle)

                activity_ids.append(activity_id)

            else:
                print("WARNING: Activity (# %s) in %s (# %s) has no identifier specified" % (
                    str(activity_count), doc_name, str(document_count)))

            activity_count += 1

        document_count += 1

        # Add provenance from corresponding JSON file
        json_document = document[:-4] + '.json'

        try:
            with open(json_document, 'r') as open_json_doc:
                json_parsed = json.load(open_json_doc)
        except (IOError, ValueError):
            # Missing or unreadable file (IOError) or invalid JSON
            # (ValueError); the provenance is then built without it.
            print("Could not parse file " + json_document)
            json_parsed = None

        provenance_converter = IatiConverter.ConvertProvenance(
            'activity', json_parsed, provenance, doc_id, last_updated,
            version, activity_ids)
        provenance = provenance_converter.convert(Iati)

        # Write provenance graph to Turtle and store in local folder
        provenance_turtle = provenance.serialize(format='turtle')

        with open(doc_folder + 'provenance-' + doc_id + '.ttl', 'w') as turtle_file:
            turtle_file.write(provenance_turtle)

    print("Failed:")

    for fail in failed_elements:
        print(fail)

    print("Done!")
def main():
    '''Converts Organisation XMLs to Turtle files and stores these to local folder.

    Every '*.xml' document in the XML folder gets its own sub folder in the
    Turtle folder, holding one Turtle file per converted organisation and
    one provenance Turtle file (plus its '.graph' file) for the document.
    A document either holds a list of organisations or is a single
    organisation itself. Documents that cannot be parsed and organisations
    that cannot be converted are reported and skipped.'''

    # Settings
    xml_folder = "/home/iati/xml/organisations/"
    turtle_folder = "/home/iati/organisation/"
    Iati = Namespace("http://purl.org/collections/iati/")

    if not os.path.isdir(turtle_folder):
        os.makedirs(turtle_folder)

    document_count = 1
    organisation_count = 1

    # Retrieve XML files from the XML folder
    for document in glob.glob(xml_folder + '*.xml'):

        organisation_ids = []

        doc_id = str(document.rsplit('/', 1)[1])[:-4]

        doc_folder = turtle_folder + doc_id + '/'

        if not os.path.isdir(doc_folder):
            os.makedirs(doc_folder)

        provenance = Graph()
        provenance.bind('iati', Iati)

        # Parse the XML file
        try:
            xml = ET.parse(document)
        except ET.ParseError:
            print("Could not parse file " + document)
            # Without a parsed document there is nothing to convert and
            # no version to describe, so the document is skipped entirely.
            continue

        root = xml.getroot()
        version = AttributeHelper.attribute_key(root, 'version')

        # Defined up front so the provenance step does not fail for a
        # document without any convertible organisation.
        # NOTE(review): assumes ConvertProvenance accepts a last_updated
        # of None - confirm.
        last_updated = None

        # Collect the organisation elements of the document
        if root.tag in ('iati-organisations', 'organisations'):
            organisations = (list(xml.findall('iati-organisation')) +
                             list(xml.findall('organisation')))
            # NOTE(review): no '.ttl.graph' file is written for
            # organisations taken from a list document, only for single
            # organisation documents - confirm this is intended.
            write_graph_file = False
        elif root.tag in ('iati-organisation', 'organisation'):
            organisations = [root]
            write_graph_file = True
        else:
            organisations = []
            write_graph_file = False

        # Convert each organisation in XML file to RDFLib Graph
        for organisation in organisations:

            try:
                converter = IatiConverter.ConvertOrganisation(organisation)
                graph, organisation_id, last_updated = converter.convert(Iati)
            except TypeError as e:
                print("Error in " + document + ":" + str(e))
                # Make sure the results of an earlier organisation are
                # not written a second time.
                graph = None
                organisation_id = None

            print("Progress: Organisation #" + str(organisation_count) +
                  " in document #" + str(document_count))

            if (graph is not None) and (organisation_id is not None):
                # Slashes in the identifier would be read as folders
                file_id = str(organisation_id.replace('/', '%2F'))

                # Write organisation to Turtle and store in local folder
                graph_turtle = graph.serialize(format='turtle')

                with open(doc_folder + file_id + '.ttl', 'w') as turtle_file:
                    turtle_file.write(graph_turtle)

                if write_graph_file:
                    # The following outputs enable the Virtuoso Bulk loader
                    # process to put files into the right graphs.
                    with open(doc_folder + file_id + '.ttl.graph', 'w') as graph_file:
                        graph_file.write(str(Iati) + 'graph/organisation/' +
                                         str(organisation_id))

                organisation_count += 1
                organisation_ids.append(organisation_id)

        document_count += 1

        # Add provenance from corresponding JSON file
        json_document = document[:-4] + '.json'

        try:
            with open(json_document, 'r') as open_json_doc:
                json_parsed = json.load(open_json_doc)
        except (IOError, ValueError):
            # Missing or unreadable file (IOError) or invalid JSON
            # (ValueError); the provenance is then built without it.
            print("Could not parse file " + json_document)
            json_parsed = None

        provenance_converter = IatiConverter.ConvertProvenance(
            'organisation', json_parsed, provenance, doc_id, last_updated,
            version, organisation_ids)
        provenance = provenance_converter.convert(Iati)

        # Write provenance graph to Turtle and store in local folder
        provenance_turtle = provenance.serialize(format='turtle')

        with open(doc_folder + 'provenance-' + doc_id + '.ttl', 'w') as turtle_file:
            turtle_file.write(provenance_turtle)

        # The following outputs enable the Virtuoso Bulk loader process to
        # put files into the right graphs.
        with open(doc_folder + 'provenance-' + doc_id + '.ttl.graph', 'w') as graph_file:
            graph_file.write(str(Iati) + 'graph/provenance/')

    print("Done!")
def main():
    '''Converts Codelist XMLs to Turtle files and stores these to local folder.

    Every '*.xml' document in the XML folder gets its own sub folder in the
    Turtle folder, holding the Turtle file of the converted codelist, a
    provenance Turtle file and a '.graph' file for each of the two.
    Documents that cannot be parsed or converted are reported and skipped.'''

    # Settings
    xml_folder = "/home/iati/xml/codelists/"
    turtle_folder = "/home/iati/codelist/"
    Iati = Namespace("http://purl.org/collections/iati/")

    if not os.path.isdir(turtle_folder):
        os.makedirs(turtle_folder)

    document_count = 1

    # Retrieve XML files from the XML folder
    for document in glob.glob(xml_folder + '*.xml'):

        doc_id = str(document.rsplit('/', 1)[1])[:-4]

        doc_folder = turtle_folder + doc_id + '/'

        if not os.path.isdir(doc_folder):
            os.makedirs(doc_folder)

        provenance = Graph()
        provenance.bind('iati', Iati)

        # Parse the XML file; one malformed document should not stop the
        # conversion of the remaining codelists.
        try:
            xml = ET.parse(document)
        except ET.ParseError:
            print("Could not parse file " + document)
            continue

        root = xml.getroot()
        version = AttributeHelper.attribute_key(root, 'version')

        try:
            # Convert each codelist in XML file to RDFLib Graph
            converter = IatiConverter.ConvertCodelist(root)
            graph, codelist_id, last_updated = converter.convert(Iati)
        except TypeError as e:
            print("Error in " + document + ":" + str(e))
            # Make sure the results of an earlier document are not reused
            graph = None
            codelist_id = None

        if (graph is not None) and (codelist_id is not None):
            # Slashes in the identifier would be read as folders; the same
            # escaped name is used for the codelist and provenance files.
            file_id = codelist_id.replace('/', '%2F')

            # Write codelist to Turtle and store in local folder
            graph_turtle = graph.serialize(format='turtle')

            with open(doc_folder + file_id + '.ttl', 'w') as turtle_file:
                turtle_file.write(graph_turtle)

            # The following outputs enable the Virtuoso Bulk loader process
            # to put files into the right graphs.
            with open(doc_folder + file_id + '.ttl.graph', 'w') as graph_file:
                graph_file.write(str(Iati) + 'graph/codelist/' + str(codelist_id))

            # Add provenance of last-updated, version and source document
            source_xml = str('http://datadev.aidinfolabs.org/data/codelist/' +
                             str(codelist_id) + '.xml')
            codelist_graph = URIRef(Iati + 'graph/codelist/' + str(codelist_id))

            provenance.add((codelist_graph,
                            URIRef(Iati + 'last-updated'),
                            Literal(last_updated)))

            provenance.add((codelist_graph,
                            URIRef(Iati + 'version'),
                            Literal(version)))

            provenance.add((codelist_graph,
                            URIRef(Iati + 'source-document-id'),
                            Literal(str(codelist_id))))

            provenance.add((codelist_graph,
                            URIRef(Iati + 'source-document-download-url'),
                            URIRef(source_xml)))

            # Add prov model
            start_time = datetime.datetime.now()
            entities = [str(codelist_id)]
            script = "conversion%20scripts/CodelistsToTurtle.py"

            provenance = AddProvenance.addProv(Iati,
                                               provenance,
                                               'codelist',
                                               doc_id,
                                               start_time,
                                               source_xml,
                                               entities,
                                               script)

            # Write provenance graph to Turtle and store in local folder
            provenance_turtle = provenance.serialize(format='turtle')

            with open(doc_folder + 'provenance-' + file_id + '.ttl', 'w') as turtle_file:
                turtle_file.write(provenance_turtle)

            # The following outputs enable the Virtuoso Bulk loader process
            # to put files into the right graphs.
            with open(doc_folder + 'provenance-' + file_id + '.ttl.graph', 'w') as graph_file:
                graph_file.write(str(Iati) + 'graph/provenance/')

        print("Progress: Document #" + str(document_count))

        document_count += 1

    print("Done!")
def main():
    '''Converts Organisation XMLs to Turtle files and stores these to local folder.

    Every '*.xml' document in the XML folder gets its own sub folder in the
    Turtle folder, holding one Turtle file per converted organisation and
    one provenance Turtle file (plus its '.graph' file) for the document.
    A document either holds a list of organisations or is a single
    organisation itself. Documents that cannot be parsed and organisations
    that cannot be converted are reported and skipped.'''

    # Settings
    xml_folder = "/home/iati/xml/organisations/"
    turtle_folder = "/home/iati/organisation/"
    Iati = Namespace("http://purl.org/collections/iati/")

    if not os.path.isdir(turtle_folder):
        os.makedirs(turtle_folder)

    document_count = 1
    organisation_count = 1

    # Retrieve XML files from the XML folder
    for document in glob.glob(xml_folder + '*.xml'):

        organisation_ids = []

        doc_id = str(document.rsplit('/', 1)[1])[:-4]

        doc_folder = turtle_folder + doc_id + '/'

        if not os.path.isdir(doc_folder):
            os.makedirs(doc_folder)

        provenance = Graph()
        provenance.bind('iati', Iati)

        # Parse the XML file
        try:
            xml = ET.parse(document)
        except ET.ParseError:
            print("Could not parse file " + document)
            # Without a parsed document there is nothing to convert and
            # no version to describe, so the document is skipped entirely.
            continue

        root = xml.getroot()
        version = AttributeHelper.attribute_key(root, 'version')

        # Defined up front so the provenance step does not fail for a
        # document without any convertible organisation.
        # NOTE(review): assumes ConvertProvenance accepts a last_updated
        # of None - confirm.
        last_updated = None

        # Collect the organisation elements of the document
        if root.tag in ('iati-organisations', 'organisations'):
            organisations = (list(xml.findall('iati-organisation')) +
                             list(xml.findall('organisation')))
            # NOTE(review): no '.ttl.graph' file is written for
            # organisations taken from a list document, only for single
            # organisation documents - confirm this is intended.
            write_graph_file = False
        elif root.tag in ('iati-organisation', 'organisation'):
            organisations = [root]
            write_graph_file = True
        else:
            organisations = []
            write_graph_file = False

        # Convert each organisation in XML file to RDFLib Graph
        for organisation in organisations:

            try:
                converter = IatiConverter.ConvertOrganisation(organisation)
                graph, organisation_id, last_updated = converter.convert(Iati)
            except TypeError as e:
                print("Error in " + document + ":" + str(e))
                # Make sure the results of an earlier organisation are
                # not written a second time.
                graph = None
                organisation_id = None

            print("Progress: Organisation #" + str(organisation_count) +
                  " in document #" + str(document_count))

            if (graph is not None) and (organisation_id is not None):
                # Slashes in the identifier would be read as folders
                file_id = str(organisation_id.replace('/', '%2F'))

                # Write organisation to Turtle and store in local folder
                graph_turtle = graph.serialize(format='turtle')

                with open(doc_folder + file_id + '.ttl', 'w') as turtle_file:
                    turtle_file.write(graph_turtle)

                if write_graph_file:
                    # The following outputs enable the Virtuoso Bulk loader
                    # process to put files into the right graphs.
                    with open(doc_folder + file_id + '.ttl.graph', 'w') as graph_file:
                        graph_file.write(str(Iati) + 'graph/organisation/' +
                                         str(organisation_id))

                organisation_count += 1
                organisation_ids.append(organisation_id)

        document_count += 1

        # Add provenance from corresponding JSON file
        json_document = document[:-4] + '.json'

        try:
            with open(json_document, 'r') as open_json_doc:
                json_parsed = json.load(open_json_doc)
        except (IOError, ValueError):
            # Missing or unreadable file (IOError) or invalid JSON
            # (ValueError); the provenance is then built without it.
            print("Could not parse file " + json_document)
            json_parsed = None

        provenance_converter = IatiConverter.ConvertProvenance(
            'organisation', json_parsed, provenance, doc_id, last_updated,
            version, organisation_ids)
        provenance = provenance_converter.convert(Iati)

        # Write provenance graph to Turtle and store in local folder
        provenance_turtle = provenance.serialize(format='turtle')

        with open(doc_folder + 'provenance-' + doc_id + '.ttl', 'w') as turtle_file:
            turtle_file.write(provenance_turtle)

        # The following outputs enable the Virtuoso Bulk loader process to
        # put files into the right graphs.
        with open(doc_folder + 'provenance-' + doc_id + '.ttl.graph', 'w') as graph_file:
            graph_file.write(str(Iati) + 'graph/provenance/')

    print("Done!")