def update_publisher(self, publisher_iati_id, publisher_abbreviation, publisher_name):
    """
    Return the publisher stored for an IATI org id, adding it when missing.

    When the publisher has to be added and the OrganisationIdentifier table
    holds an entry for the id, that entry's abbreviation replaces the one
    passed in. Any exception is handed to exception_handler; nothing (None)
    is returned in that case.
    """
    try:
        # Already in the database: hand back the stored publisher.
        if models.Publisher.objects.filter(org_id=publisher_iati_id).exists():
            return models.Publisher.objects.get(org_id=publisher_iati_id)

        # Otherwise take the abbreviation from the organisation_identifier
        # table when it is known there, and add the publisher.
        known_identifiers = OrganisationIdentifier.objects.filter(
            code=publisher_iati_id)
        if known_identifiers.exists():
            identifier = OrganisationIdentifier.objects.get(
                code=publisher_iati_id)
            publisher_abbreviation = identifier.abbreviation

        return self.add_publisher_to_db(
            publisher_iati_id, publisher_abbreviation, publisher_name)
    except Exception as e:
        exception_handler(
            e, publisher_iati_id, "dataset_syncer.update_publisher")
def synchronize_with_iati_api_by_page(self, url, data_type, try_number=0):
    """
    Loop through the datasets by page

    Fetches ``url``, decodes the body as JSON and passes every entry of its
    "results" list to ``parse_json_line``. A failure while parsing a single
    entry is reported through ``exception_handler`` and does not stop the
    remaining entries.

    :param url: url of the result page to fetch.
    :param data_type: passed through unchanged to ``parse_json_line``.
    :param try_number: attempts already made for this url; after a failed
        request the page is requested again while this is below 4.
    :return: always None.
    """
    # TODO: Clean this function
    try:
        response = urllib2.build_opener().open(urllib2.Request(url))
        try:
            json_objects = json.load(response)
        finally:
            # Release the connection even when the body is not valid JSON.
            response.close()

        if json_objects is not None:
            # For each dataset object
            for line in json_objects['results']:
                try:
                    self.parse_json_line(line, data_type)
                except Exception as e:
                    exception_handler(
                        e,
                        'synchronize_with_iati_api_by_page',
                        "Unexpected error")
    except (urllib2.HTTPError, urllib2.URLError, httplib.HTTPException) as e:
        exception_handler(e, "HTTP error", url)
        if try_number < 4:
            self.synchronize_with_iati_api_by_page(
                url, data_type, try_number + 1)
        else:
            return None
def synchronize_with_iati_api_by_page(self, url, data_type, try_number=0):
    """
    Loop through the datasets by page

    Opens ``url``, loads the JSON document it returns and calls
    ``parse_json_line`` for each item under its "results" key. Errors raised
    for one item go to ``exception_handler`` and the loop continues.

    :param url: url of the page to request.
    :param data_type: forwarded as-is to ``parse_json_line``.
    :param try_number: how many attempts were already made; on an HTTP/URL
        error the same page is retried while this value is below 4.
    :return: always None.
    """
    # TODO: Clean this function
    try:
        req = urllib2.Request(url)
        opener = urllib2.build_opener()
        f = opener.open(req)
        try:
            json_objects = json.load(f)
        finally:
            # The response used to stay open after reading; close it here.
            f.close()

        if json_objects is not None:
            # For each dataset object
            for line in json_objects['results']:
                try:
                    self.parse_json_line(line, data_type)
                except Exception as e:
                    exception_handler(e, 'synchronize_with_iati_api_by_page', "Unexpected error")
    except (urllib2.HTTPError, urllib2.URLError, httplib.HTTPException) as e:
        exception_handler(e, "HTTP error", url)
        if try_number < 4:
            self.synchronize_with_iati_api_by_page(url, data_type, try_number + 1)
        else:
            return None
def remove_values_for_activity(self, cur_activity):
    """
    Delete an activity together with the rows that reference it.

    Rows of the models filtered on the activity are deleted first, then the
    results (with their indicators and indicator periods), the Ffs rows
    (with their forecasts) and the CrsAdd rows (with their loan status and
    loan terms), and finally the activity itself. Any exception is passed to
    exception_handler instead of being raised.
    """
    try:
        models.ActivityParticipatingOrganisation.objects.filter(
            activity=cur_activity).delete()
        models.ActivityPolicyMarker.objects.filter(
            activity=cur_activity).delete()
        models.ActivitySector.objects.filter(
            activity=cur_activity).delete()
        models.ActivityRecipientCountry.objects.filter(
            activity=cur_activity).delete()
        models.CountryBudgetItem.objects.filter(
            activity=cur_activity).delete()
        models.ActivityRecipientRegion.objects.filter(
            activity=cur_activity).delete()
        models.OtherIdentifier.objects.filter(
            activity=cur_activity).delete()
        models.ActivityWebsite.objects.filter(
            activity=cur_activity).delete()
        models.ContactInfo.objects.filter(activity=cur_activity).delete()
        models.Transaction.objects.filter(activity=cur_activity).delete()
        models.PlannedDisbursement.objects.filter(
            activity=cur_activity).delete()
        models.DocumentLink.objects.filter(activity=cur_activity).delete()
        models.RelatedActivity.objects.filter(
            current_activity=cur_activity).delete()
        models.Title.objects.filter(activity=cur_activity).delete()
        models.Description.objects.filter(activity=cur_activity).delete()
        models.Location.objects.filter(activity=cur_activity).delete()
        models.Budget.objects.filter(activity=cur_activity).delete()
        models.Condition.objects.filter(activity=cur_activity).delete()
        models.ActivitySearchData.objects.filter(
            activity=cur_activity).delete()

        for r in models.Result.objects.filter(activity=cur_activity):
            for ri in models.ResultIndicator.objects.filter(result=r):
                models.ResultIndicatorPeriod.objects.filter(
                    result_indicator=ri).delete()
                ri.delete()
            r.delete()

        for f in models.Ffs.objects.filter(activity=cur_activity):
            models.FfsForecast.objects.filter(ffs=f).delete()
            f.delete()

        for c in models.CrsAdd.objects.filter(activity=cur_activity):
            # These two querysets were previously built and discarded
            # without .delete(), so the statements did nothing.
            models.CrsAddLoanStatus.objects.filter(crs_add=c).delete()
            models.CrsAddLoanTerms.objects.filter(crs_add=c).delete()
            c.delete()

        cur_activity.delete()
    except Exception as e:
        exception_handler(e, cur_activity.id, "remove_values_for_activity")
def delete_by_source(self, xml_source_ref):
    """
    Pass every activity with the given xml_source_ref to
    remove_values_for_activity.

    Any exception is reported through exception_handler instead of raised.
    """
    try:
        matching = models.Activity.objects.filter(
            xml_source_ref=xml_source_ref)
        for stale_activity in matching:
            self.remove_values_for_activity(stale_activity)
    except Exception as e:
        exception_handler(e, xml_source_ref, "delete_by_source")
def delete_by_source(self, xml_source_ref):
    """
    Run remove_values_for_activity on each activity that carries the given
    xml_source_ref; exceptions go to exception_handler.
    """
    try:
        for source_activity in models.Activity.objects.filter(
                xml_source_ref=xml_source_ref):
            self.remove_values_for_activity(source_activity)
    except Exception as e:
        exception_handler(e, xml_source_ref, "delete_by_source")
def parse_json_line(self, line, data_type):
    """
    Parse line from IATI response

    Reads the publisher id (``line['extras']['publisher_iati_id']``), the
    first ``res_url`` (spaces replaced by ``%20``), the name, the title and
    the organisation title found in the JSON string under ``data_dict``.

    When the url is not in ``self.source_urls`` the publisher is looked up or
    added and the source is registered; otherwise the stored source gets a
    new ``last_found_in_registry`` and, when its publisher's org id differs,
    a new publisher.

    :param line: one entry of the "results" list of the registry response.
    :param data_type: passed through to ``add_iati_xml_source_to_db``.
    """
    try:
        publisher_iati_id = line['extras']['publisher_iati_id']
    except KeyError:
        publisher_iati_id = None

    publisher_abbreviation = ''
    publisher_name = 'Unknown'

    try:
        source_url = str(line['res_url'][0]).replace(' ', '%20')
    except IndexError:
        source_url = ''

    source_name = line.get('name', '')
    source_title = line.get('title', '')

    try:
        data_dict = json.loads(line.get('data_dict', ''))
        publisher_name = data_dict['organization']['title']
    except (ValueError, KeyError, TypeError):
        # No usable organisation title: empty/invalid JSON, a missing key,
        # or a null "data_dict"/"organization" (TypeError). The last case
        # used to be reported as an unexpected error although it only means
        # the title is absent; the name stays 'Unknown'.
        pass
    except Exception as e:
        msg = ("Unexpected error in synchronize_with_iati_api_by_page "
               "organisation match:")
        exception_handler(e, 'synchronize_with_iati_api_by_page', msg)

    if source_url not in self.source_urls:
        if publisher_iati_id:
            current_publisher = self.update_publisher(
                publisher_iati_id, publisher_abbreviation, publisher_name)
        else:
            current_publisher = self.add_publisher_to_db(
                'Unknown', publisher_abbreviation, publisher_name)

        self.add_iati_xml_source_to_db(
            source_url, source_title, source_name, current_publisher,
            data_type)
        self.source_urls.append(source_url)
    else:
        msg = "Updated publisher and last found in registry on: "
        exception_handler(None, msg, source_url)

        source = models.IatiXmlSource.objects.get(source_url=source_url)
        source.last_found_in_registry = datetime.datetime.now()
        if source.publisher.org_id != publisher_iati_id:
            new_publisher = self.update_publisher(
                publisher_iati_id, publisher_abbreviation, publisher_name)
            source.publisher = new_publisher
        source.save(process=False, added_manually=False)
def remove_values_for_activity(self, cur_activity):
    """
    Delete an activity and the rows of the models that point at it.

    The directly related rows are deleted first (in the order listed below),
    followed by results with their indicators and periods, Ffs rows with
    their forecasts, CrsAdd rows with their loan status and loan terms, and
    finally the activity itself. Any exception is handed to
    exception_handler rather than raised.
    """
    try:
        # (model, name of the lookup that points at the activity)
        directly_related = (
            (models.ActivityParticipatingOrganisation, 'activity'),
            (models.ActivityPolicyMarker, 'activity'),
            (models.ActivitySector, 'activity'),
            (models.ActivityRecipientCountry, 'activity'),
            (models.CountryBudgetItem, 'activity'),
            (models.ActivityRecipientRegion, 'activity'),
            (models.OtherIdentifier, 'activity'),
            (models.ActivityWebsite, 'activity'),
            (models.ContactInfo, 'activity'),
            (models.Transaction, 'activity'),
            (models.PlannedDisbursement, 'activity'),
            (models.DocumentLink, 'activity'),
            (models.RelatedActivity, 'current_activity'),
            (models.Title, 'activity'),
            (models.Description, 'activity'),
            (models.Location, 'activity'),
            (models.Budget, 'activity'),
            (models.Condition, 'activity'),
            (models.ActivitySearchData, 'activity'),
        )
        for model, lookup in directly_related:
            model.objects.filter(**{lookup: cur_activity}).delete()

        for r in models.Result.objects.filter(activity=cur_activity):
            for ri in models.ResultIndicator.objects.filter(result=r):
                models.ResultIndicatorPeriod.objects.filter(result_indicator=ri).delete()
                ri.delete()
            r.delete()

        for f in models.Ffs.objects.filter(activity=cur_activity):
            models.FfsForecast.objects.filter(ffs=f).delete()
            f.delete()

        for c in models.CrsAdd.objects.filter(activity=cur_activity):
            # Both filters lacked .delete() before, which made them no-ops.
            models.CrsAddLoanStatus.objects.filter(crs_add=c).delete()
            models.CrsAddLoanTerms.objects.filter(crs_add=c).delete()
            c.delete()

        cur_activity.delete()
    except Exception as e:
        exception_handler(e, cur_activity.id, "remove_values_for_activity")
def update_publisher(self, publisher_iati_id, publisher_abbreviation, publisher_name):
    """
    Get the publisher with the given org id, adding it when it is unknown.

    :param publisher_iati_id: org id used to look the publisher up.
    :param publisher_abbreviation: abbreviation for a newly added publisher;
        replaced by the abbreviation from the OrganisationIdentifier table
        when that table has an entry for the id.
    :param publisher_name: name for a newly added publisher.
    :return: the stored or newly added publisher, or None when an exception
        occurred (it is passed to exception_handler).
    """
    try:
        # if already in the database, get the publisher_id, else add the publisher
        if models.Publisher.objects.filter(org_id=publisher_iati_id).exists():
            current_publisher = models.Publisher.objects.get(org_id=publisher_iati_id)
        else:
            # get the abbreviation from organisation_identifier table
            if OrganisationIdentifier.objects.filter(code=publisher_iati_id).exists():
                current_publisher_meta = OrganisationIdentifier.objects.get(code=publisher_iati_id)
                publisher_abbreviation = current_publisher_meta.abbreviation

            current_publisher = self.add_publisher_to_db(
                publisher_iati_id, publisher_abbreviation, publisher_name)

        # Single exit point; the duplicated second return was unreachable.
        return current_publisher
    except Exception as e:
        exception_handler(e, publisher_iati_id, "dataset_syncer.update_publisher")
def parse_url(self, source):
    """
    Parses the source with url

    Fetches the file behind ``source.source_url`` with a FileGrabber, builds
    an XML root from its content, obtains a parser from ``prepare_parser``
    and runs ``load_and_parse`` on the root. Nothing is parsed when the
    grabber returns a falsy value. Exceptions are passed to
    exception_handler instead of being raised.
    """
    url = source.source_url
    # Only referenced by the disabled delete-by-source step below.
    xml_source_ref = source.ref
    # last_hash = source.last_hash

    try:
        iati_file = FileGrabber().get_the_file(url)
        if not iati_file:
            return

        # delete old activities
        # TODO: determine this in the parser based on last-updated-datetime
        # TODO: also, throw away all narratives
        # try:
        #     deleter = Deleter()
        #     deleter.delete_by_source(xml_source_ref)
        # except Exception as e:
        #     exception_handler(e, "parse url", "delete by source")

        raw_xml = iati_file.read()
        root = etree.fromstring(str(raw_xml))

        self.prepare_parser(root, source).load_and_parse(root)

        # Throw away query logs when in debug mode to prevent memory from
        # overflowing
        if settings.DEBUG:
            from django import db
            db.reset_queries()
    except Exception as e:
        exception_handler(e, "parse url", "parse_url")
def parse_json_line(self, line, data_type):
    """
    Parse line from IATI response

    Extracts the publisher id, the first resource url (spaces become
    ``%20``), the dataset name and title, and the organisation title from
    the JSON string stored under ``data_dict``.

    A url that is not yet in ``self.source_urls`` is registered as a new
    source (with a looked-up or newly added publisher) and remembered. For a
    known url the stored source gets a fresh ``last_found_in_registry`` and
    its publisher is replaced when the org id no longer matches.

    :param line: one entry of the "results" list of the registry response.
    :param data_type: passed through to ``add_iati_xml_source_to_db``.
    """
    try:
        publisher_iati_id = line['extras']['publisher_iati_id']
    except KeyError:
        publisher_iati_id = None

    publisher_abbreviation = ''
    publisher_name = 'Unknown'

    try:
        source_url = str(line['res_url'][0]).replace(' ', '%20')
    except IndexError:
        source_url = ''

    source_name = line.get('name', '')
    source_title = line.get('title', '')

    try:
        data_dict = json.loads(line.get('data_dict', ''))
        publisher_name = data_dict['organization']['title']
    except (ValueError, KeyError, TypeError):
        # TypeError covers a null "data_dict" or "organization"; like the
        # other two it only means there is no title to read, so it is no
        # longer logged as an unexpected error.
        pass
    except Exception as e:
        msg = ("Unexpected error in synchronize_with_iati_api_by_page "
               "organisation match:")
        exception_handler(e, 'synchronize_with_iati_api_by_page', msg)

    if source_url not in self.source_urls:
        if publisher_iati_id:
            current_publisher = self.update_publisher(
                publisher_iati_id,
                publisher_abbreviation,
                publisher_name)
        else:
            current_publisher = self.add_publisher_to_db(
                'Unknown',
                publisher_abbreviation,
                publisher_name)

        self.add_iati_xml_source_to_db(
            source_url,
            source_title,
            source_name,
            current_publisher,
            data_type)
        self.source_urls.append(source_url)
    else:
        msg = "Updated publisher and last found in registry on: "
        exception_handler(None, msg, source_url)

        source = models.IatiXmlSource.objects.get(source_url=source_url)
        source.last_found_in_registry = datetime.datetime.now()
        if source.publisher.org_id != publisher_iati_id:
            new_publisher = self.update_publisher(
                publisher_iati_id,
                publisher_abbreviation,
                publisher_name)
            source.publisher = new_publisher
        source.save(process=False, added_manually=False)
class DatasetSyncer():
    """
    Registers the datasets listed by the IATI registry search API as
    IatiXmlSource rows, together with their publishers.
    """

    # Start looping through the datasets
    def synchronize_with_iati_api(self, type):
        """
        Request the registry search results page by page (offsets 0 to 9800
        in steps of 200) and synchronize every page.

        :param type: 1 for activity files, 2 for organisation files.
        :raises ValueError: for any other value (previously this surfaced as
            an UnboundLocalError on the first page).
        """
        if type == 1:
            cur_url = "http://www.iatiregistry.org/api/search/dataset?extras_filetype=activity&all_fields=1&limit=200&offset="
        elif type == 2:
            cur_url = "http://www.iatiregistry.org/api/search/dataset?extras_filetype=organisation&all_fields=1&limit=200&offset="
        else:
            raise ValueError(
                "type must be 1 (activity) or 2 (organisation), got %r" % (type,))

        # The normalisation does not depend on the offset; do it once.
        cur_url = cur_url.strip().replace(" ", "%20")

        for offset in range(0, 10000, 200):
            self.synchronize_with_iati_api_by_page(cur_url + str(offset), type)

    # Loop through the datasets by page
    def synchronize_with_iati_api_by_page(self, cur_url, cur_type, try_number=0):
        """
        Fetch one result page and register every dataset listed on it.

        A failure for a single dataset is reported through exception_handler
        and does not stop the rest of the page. After an HTTPError or
        URLError the page is requested again while ``try_number`` is below 6.

        :param cur_url: url of the result page.
        :param cur_type: passed through to ``add_iati_xml_source_to_db``.
        :param try_number: attempts already made for this url.
        :return: always None.
        """
        try:
            response = urllib2.build_opener().open(urllib2.Request(cur_url))
            try:
                json_objects = json.load(response)
            finally:
                # The response was never closed before.
                response.close()

            if json_objects is not None:
                # For each dataset object
                for dataset in json_objects["results"]:
                    try:
                        self._register_dataset(dataset, cur_type)
                    except Exception as e:
                        exception_handler(e,
                                          "synchronize_with_iati_api_by_page",
                                          "Unexpected error")

        except urllib2.HTTPError as e:
            exception_handler(e, "HTTP error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)
            else:
                return None

        except urllib2.URLError as e:
            exception_handler(e, "URL error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)

    def _register_dataset(self, dataset, cur_type):
        """Add one registry dataset as a source, or refresh the stored one."""
        publisher_iati_id = None
        publisher_abbreviation = None
        publisher_name = "Unknown"

        source_url = str(dataset["res_url"][0]).replace(" ", "%20")
        source_name = dataset["name"]

        source_title = None
        if "title" in dataset:
            source_title = dataset["title"]

        if "publisher_iati_id" in dataset["extras"]:
            publisher_iati_id = dataset["extras"]["publisher_iati_id"]

        if "data_dict" in dataset:
            try:
                data_dict = json.loads(dataset["data_dict"])
                if "organization" in data_dict and data_dict["organization"]:
                    if "title" in data_dict["organization"]:
                        publisher_name = data_dict["organization"]["title"]
            except Exception as e:
                exception_handler(
                    e, "synchronize_with_iati_api_by_page",
                    "Unexpected error in synchronize_with_iati_api_by_page organisation match:"
                )

        # If download url is not already in OIPA
        if not models.IatiXmlSource.objects.filter(
                source_url=source_url).exists():
            # If publisher_iati_id is given (None and "" are both falsy)
            if publisher_iati_id:
                current_publisher = self.update_publisher(
                    publisher_iati_id, publisher_abbreviation,
                    publisher_name)
            else:
                # else publisher is unknown
                current_publisher = self.add_publisher_to_db(
                    "Unknown", "Unknown", "Unknown")

            self.add_iati_xml_source_to_db(
                source_url, source_title, source_name, current_publisher,
                cur_type)
        else:
            exception_handler(
                None, "Updated publisher and last found in registry on: ",
                source_url)

            cursource = models.IatiXmlSource.objects.get(
                source_url=source_url)
            cursource.last_found_in_registry = datetime.datetime.now()

            # check if publisher meta is already known, if not, add it and
            # check if the known publisher already existed and add it to the
            # source
            if cursource.publisher.org_id != publisher_iati_id:
                cursource.publisher = self.update_publisher(
                    publisher_iati_id, publisher_abbreviation,
                    publisher_name)
            cursource.save(process=False, added_manually=False)
"Unexpected error") except urllib2.HTTPError, e: exception_handler(e, "HTTP error", cur_url) if try_number < 6: self.synchronize_with_iati_api_by_page(cur_url, cur_type, try_number + 1) else: return None except urllib2.URLError, e: exception_handler(e, "URL error", cur_url) if try_number < 6: self.synchronize_with_iati_api_by_page(cur_url, cur_type, try_number + 1) except httplib.HTTPException, e: exception_handler(e, "HTTP Exception", cur_url) if try_number < 6: self.synchronize_with_iati_api_by_page(cur_url, cur_type, try_number + 1) def update_publisher(self, publisher_iati_id, publisher_abbreviation, publisher_name): try: # if already in the database, get the publisher_id, else add the publisher if (models.Publisher.objects.filter( org_id=publisher_iati_id).exists()): current_publisher = models.Publisher.objects.get( org_id=publisher_iati_id) else: # get the abbreviation from organisation_identifier table
def synchronize_with_iati_api_by_page(self, cur_url, cur_type, try_number=0):
    """
    Fetch one result page and register every dataset listed on it.

    For each entry of the page's "results" list the source url (spaces
    replaced by ``%20``), name, title, publisher id and organisation title
    are read. A url without a stored IatiXmlSource is added as a new source;
    otherwise the stored source gets a new ``last_found_in_registry`` and,
    when its publisher's org id differs, a new publisher. A failure for one
    entry is reported through exception_handler and the loop continues.

    :param cur_url: url of the result page.
    :param cur_type: passed through to ``add_iati_xml_source_to_db``.
    :param try_number: attempts already made; after an HTTPError the page is
        requested again while this is below 6.
    :return: always None.
    """
    try:
        response = urllib2.build_opener().open(urllib2.Request(cur_url))
        try:
            json_objects = json.load(response)
        finally:
            # The response was never closed before.
            response.close()

        if json_objects is not None:
            # For each dataset object
            for dataset in json_objects["results"]:
                try:
                    publisher_iati_id = None
                    publisher_abbreviation = None
                    publisher_name = "Unknown"

                    source_url = str(dataset["res_url"][0]).replace(" ", "%20")
                    source_name = dataset["name"]

                    source_title = None
                    if "title" in dataset:
                        source_title = dataset["title"]

                    if "publisher_iati_id" in dataset["extras"]:
                        publisher_iati_id = dataset["extras"]["publisher_iati_id"]

                    if "data_dict" in dataset:
                        try:
                            data_dict = json.loads(dataset["data_dict"])
                            if "organization" in data_dict and data_dict["organization"]:
                                if "title" in data_dict["organization"]:
                                    publisher_name = data_dict["organization"]["title"]
                        except Exception as e:
                            exception_handler(e, "synchronize_with_iati_api_by_page", "Unexpected error in synchronize_with_iati_api_by_page organisation match:")

                    # If download url is not already in OIPA
                    if not models.IatiXmlSource.objects.filter(source_url=source_url).exists():
                        # If publisher_iati_id is given (None and "" are both falsy)
                        if publisher_iati_id:
                            current_publisher = self.update_publisher(publisher_iati_id, publisher_abbreviation, publisher_name)
                        else:
                            # else publisher is unknown
                            current_publisher = self.add_publisher_to_db("Unknown", "Unknown", "Unknown")

                        self.add_iati_xml_source_to_db(source_url, source_title, source_name, current_publisher, cur_type)
                    else:
                        exception_handler(None, "Updated publisher and last found in registry on: ", source_url)

                        cursource = models.IatiXmlSource.objects.get(source_url=source_url)
                        cursource.last_found_in_registry = datetime.datetime.now()

                        # check if publisher meta is already known, if not, add it and check if
                        # the known publisher already existed and add it to the source
                        if cursource.publisher.org_id != publisher_iati_id:
                            cursource.publisher = self.update_publisher(publisher_iati_id, publisher_abbreviation, publisher_name)
                        cursource.save(process=False, added_manually=False)

                except Exception as e:
                    exception_handler(e, "synchronize_with_iati_api_by_page", "Unexpected error")

    except urllib2.HTTPError as e:
        exception_handler(e, "HTTP error", cur_url)
        if try_number < 6:
            self.synchronize_with_iati_api_by_page(cur_url, cur_type, try_number + 1)
        else:
            return None
exception_handler(e, "synchronize_with_iati_api_by_page", "Unexpected error") except urllib2.HTTPError, e: exception_handler(e, "HTTP error", cur_url) if try_number < 6: self.synchronize_with_iati_api_by_page(cur_url, cur_type,try_number + 1) else: return None except urllib2.URLError, e: exception_handler(e, "URL error", cur_url) if try_number < 6: self.synchronize_with_iati_api_by_page(cur_url, cur_type,try_number + 1) except httplib.HTTPException, e: exception_handler(e, "HTTP Exception", cur_url) if try_number < 6: self.synchronize_with_iati_api_by_page(cur_url, cur_type,try_number + 1) def update_publisher(self, publisher_iati_id, publisher_abbreviation, publisher_name): try: # if already in the database, get the publisher_id, else add the publisher if (models.Publisher.objects.filter(org_id=publisher_iati_id).exists()): current_publisher = models.Publisher.objects.get(org_id=publisher_iati_id) else: # get the abbreviation from organisation_identifier table if(OrganisationIdentifier.objects.filter(code=publisher_iati_id).exists()): current_publisher_meta = OrganisationIdentifier.objects.get(code=publisher_iati_id) publisher_abbreviation = current_publisher_meta.abbreviation