示例#1
0
    def update_publisher(self, publisher_iati_id, publisher_abbreviation,
                         publisher_name):
        """
        Return the publisher stored under ``publisher_iati_id``, adding it
        first when no such publisher exists yet.

        For a publisher that still has to be added, the abbreviation is
        replaced by the one on the ``OrganisationIdentifier`` row whose code
        equals ``publisher_iati_id``, when such a row exists.

        Any exception is passed to ``exception_handler``; the method then
        falls off the end and returns None.
        """
        try:
            publishers = models.Publisher.objects
            if publishers.filter(org_id=publisher_iati_id).exists():
                # Already in the database: hand back the stored publisher.
                return publishers.get(org_id=publisher_iati_id)

            # Not stored yet: look up the abbreviation in the
            # organisation_identifier table before adding the publisher.
            identifiers = OrganisationIdentifier.objects
            if identifiers.filter(code=publisher_iati_id).exists():
                identifier = identifiers.get(code=publisher_iati_id)
                publisher_abbreviation = identifier.abbreviation

            return self.add_publisher_to_db(
                publisher_iati_id, publisher_abbreviation, publisher_name)

        except Exception as e:
            exception_handler(e, publisher_iati_id,
                              "dataset_syncer.update_publisher")
示例#2
0
    def synchronize_with_iati_api_by_page(self, url, data_type, try_number=0):
        """
        Fetch one page of registry results and hand every dataset on it to
        ``parse_json_line``.

        :param url: URL of the page to request.
        :param data_type: passed through unchanged to ``parse_json_line``.
        :param try_number: number of failed attempts so far; the request is
            retried while this is below 4.
        :returns: always None.

        HTTP/URL errors are reported through ``exception_handler`` and the
        page is requested again. An error raised while parsing a single
        dataset is reported and does not stop the rest of the page.
        """
        # TODO: Clean this function
        try:
            response = urllib2.build_opener().open(urllib2.Request(url))
            try:
                json_objects = json.load(response)
            finally:
                # Release the connection even when the body is not valid JSON.
                response.close()

            if json_objects is not None:
                # For each dataset object
                for line in json_objects['results']:
                    try:
                        self.parse_json_line(line, data_type)
                    except Exception as e:
                        exception_handler(
                            e,
                            'synchronize_with_iati_api_by_page',
                            "Unexpected error")

        # The "as" form matches the handlers above; the comma form is
        # Python 2 only.
        except (urllib2.HTTPError, urllib2.URLError,
                httplib.HTTPException) as e:
            exception_handler(e, "HTTP error", url)

            if try_number < 4:
                return self.synchronize_with_iati_api_by_page(
                    url,
                    data_type,
                    try_number + 1)
            return None
示例#3
0
    def synchronize_with_iati_api_by_page(self, url, data_type, try_number=0):
        """
        Loop through the datasets on one registry result page.

        The page at ``url`` is downloaded, decoded as JSON and each entry of
        its ``results`` list is given to ``parse_json_line`` together with
        ``data_type``.

        :param url: URL of the page to request.
        :param data_type: passed through unchanged to ``parse_json_line``.
        :param try_number: number of failed attempts so far; the request is
            retried while this is below 4.
        :returns: always None.
        """
        # TODO: Clean this function
        try:
            request = urllib2.Request(url)
            response = urllib2.build_opener().open(request)
            try:
                json_objects = json.load(response)
            finally:
                # Always close the response, also when decoding fails.
                response.close()

            if json_objects is not None:
                for line in json_objects['results']:
                    # A failure on one dataset is reported and must not
                    # abort the remaining datasets.
                    try:
                        self.parse_json_line(line, data_type)
                    except Exception as e:
                        exception_handler(e,
                                          'synchronize_with_iati_api_by_page',
                                          "Unexpected error")

        # "except ... as e" is used everywhere else in this function; the
        # comma form only parses on Python 2.
        except (urllib2.HTTPError, urllib2.URLError,
                httplib.HTTPException) as e:
            exception_handler(e, "HTTP error", url)

            if try_number < 4:
                return self.synchronize_with_iati_api_by_page(
                    url, data_type, try_number + 1)
            return None
示例#4
0
    def remove_values_for_activity(self, cur_activity):
        """
        Delete ``cur_activity`` and the rows that reference it.

        Rows in the related tables are removed first (children before their
        parents) and the activity itself is deleted last. Any exception is
        passed to ``exception_handler`` instead of being raised.

        :param cur_activity: the activity instance to remove.
        """
        try:
            # (model, name of the field that points at the activity), listed
            # in the order in which the tables are cleared.
            directly_related = (
                (models.ActivityParticipatingOrganisation, 'activity'),
                (models.ActivityPolicyMarker, 'activity'),
                (models.ActivitySector, 'activity'),
                (models.ActivityRecipientCountry, 'activity'),
                (models.CountryBudgetItem, 'activity'),
                (models.ActivityRecipientRegion, 'activity'),
                (models.OtherIdentifier, 'activity'),
                (models.ActivityWebsite, 'activity'),
                (models.ContactInfo, 'activity'),
                (models.Transaction, 'activity'),
                (models.PlannedDisbursement, 'activity'),
                (models.DocumentLink, 'activity'),
                (models.RelatedActivity, 'current_activity'),
                (models.Title, 'activity'),
                (models.Description, 'activity'),
                (models.Location, 'activity'),
                (models.Budget, 'activity'),
                (models.Condition, 'activity'),
                (models.ActivitySearchData, 'activity'),
            )
            for model, field_name in directly_related:
                model.objects.filter(**{field_name: cur_activity}).delete()

            # Results own indicators, which own periods: delete bottom-up.
            for result in models.Result.objects.filter(activity=cur_activity):
                for indicator in models.ResultIndicator.objects.filter(
                        result=result):
                    models.ResultIndicatorPeriod.objects.filter(
                        result_indicator=indicator).delete()
                    indicator.delete()
                result.delete()

            for ffs in models.Ffs.objects.filter(activity=cur_activity):
                models.FfsForecast.objects.filter(ffs=ffs).delete()
                ffs.delete()

            for crs_add in models.CrsAdd.objects.filter(
                    activity=cur_activity):
                # These two querysets used to be built and then discarded;
                # .delete() is what actually removes the rows.
                models.CrsAddLoanStatus.objects.filter(
                    crs_add=crs_add).delete()
                models.CrsAddLoanTerms.objects.filter(
                    crs_add=crs_add).delete()
                crs_add.delete()

            cur_activity.delete()

        except Exception as e:
            exception_handler(e, cur_activity.id, "remove_values_for_activity")
示例#5
0
    def delete_by_source(self, xml_source_ref):
        """
        Remove every activity whose ``xml_source_ref`` equals the given value.

        Each matching activity is handed to ``remove_values_for_activity``.
        Any exception is passed to ``exception_handler`` rather than raised.
        """
        try:
            matching = models.Activity.objects.filter(
                xml_source_ref=xml_source_ref)
            for stale_activity in matching:
                self.remove_values_for_activity(stale_activity)

        except Exception as e:
            exception_handler(e, xml_source_ref, "delete_by_source")
示例#6
0
    def delete_by_source(self, xml_source_ref):
        """
        Delete all activities that belong to one XML source.

        Activities are selected on their ``xml_source_ref`` field and removed
        one by one through ``remove_values_for_activity``. Exceptions are
        reported via ``exception_handler`` and not re-raised.
        """
        try:
            for activity in models.Activity.objects.filter(
                    xml_source_ref=xml_source_ref):
                self.remove_values_for_activity(activity)

        except Exception as e:
            exception_handler(e, xml_source_ref, "delete_by_source")
示例#7
0
    def parse_json_line(self, line, data_type):
        """
        Parse line from IATI response

        ``line`` is one dataset entry of the registry result. Its source URL
        is looked up in ``self.source_urls``:

        * unknown URL: the publisher is resolved (or added as 'Unknown'), the
          source is added through ``add_iati_xml_source_to_db`` and the URL
          is appended to ``self.source_urls``;
        * known URL: the stored ``IatiXmlSource`` gets a fresh
          ``last_found_in_registry`` and, when needed, another publisher.

        :param line: dataset entry (dict) from the registry response.
        :param data_type: passed on to ``add_iati_xml_source_to_db``.
        """
        try:
            publisher_iati_id = line['extras']['publisher_iati_id']
        except KeyError:
            publisher_iati_id = None

        publisher_abbreviation = ''
        publisher_name = 'Unknown'
        try:
            source_url = str(line['res_url'][0]).replace(' ', '%20')
        except IndexError:
            # Dataset without any resource URL.
            source_url = ''
        source_name = line.get('name', '')
        source_title = line.get('title', '')

        try:
            data_dict = json.loads(line.get('data_dict', ''))
            publisher_name = data_dict['organization']['title']
        except (ValueError, KeyError):
            # No (valid) data_dict or no organisation title: keep 'Unknown'.
            pass
        except Exception as e:
            msg = ("Unexpected error in synchronize_with_iati_api_by_page "
                   "organisation match:")
            exception_handler(e, 'synchronize_with_iati_api_by_page', msg)

        if source_url not in self.source_urls:

            if publisher_iati_id:
                current_publisher = self.update_publisher(
                    publisher_iati_id, publisher_abbreviation, publisher_name)
            else:
                current_publisher = self.add_publisher_to_db(
                    'Unknown', publisher_abbreviation, publisher_name)

            self.add_iati_xml_source_to_db(source_url, source_title,
                                           source_name, current_publisher,
                                           data_type)

            self.source_urls.append(source_url)

        else:
            msg = "Updated publisher and last found in registry on: "
            exception_handler(None, msg, source_url)

            source = models.IatiXmlSource.objects.get(source_url=source_url)
            source.last_found_in_registry = datetime.datetime.now()

            # Only look for another publisher when the registry actually
            # names one; without an id there is nothing to look up.
            if (publisher_iati_id
                    and source.publisher.org_id != publisher_iati_id):
                new_publisher = self.update_publisher(publisher_iati_id,
                                                      publisher_abbreviation,
                                                      publisher_name)
                # update_publisher may come back with None after it reported
                # an error; keep the current publisher in that case.
                if new_publisher is not None:
                    source.publisher = new_publisher
            source.save(process=False, added_manually=False)
示例#8
0
    def remove_values_for_activity(self, cur_activity):
        """
        Remove ``cur_activity`` from the database along with its related rows.

        The tables that reference the activity are cleared first, nested
        records (results, FFS, CRS-add) are removed child-first, and the
        activity itself goes last. Exceptions are reported through
        ``exception_handler`` and not re-raised.

        :param cur_activity: the activity instance to remove.
        """
        try:
            # Each entry: (model, field that references the activity). The
            # sequence fixes the order in which the tables are cleared.
            referencing = (
                (models.ActivityParticipatingOrganisation, 'activity'),
                (models.ActivityPolicyMarker, 'activity'),
                (models.ActivitySector, 'activity'),
                (models.ActivityRecipientCountry, 'activity'),
                (models.CountryBudgetItem, 'activity'),
                (models.ActivityRecipientRegion, 'activity'),
                (models.OtherIdentifier, 'activity'),
                (models.ActivityWebsite, 'activity'),
                (models.ContactInfo, 'activity'),
                (models.Transaction, 'activity'),
                (models.PlannedDisbursement, 'activity'),
                (models.DocumentLink, 'activity'),
                (models.RelatedActivity, 'current_activity'),
                (models.Title, 'activity'),
                (models.Description, 'activity'),
                (models.Location, 'activity'),
                (models.Budget, 'activity'),
                (models.Condition, 'activity'),
                (models.ActivitySearchData, 'activity'),
            )
            for model, field_name in referencing:
                model.objects.filter(**{field_name: cur_activity}).delete()

            for r in models.Result.objects.filter(activity=cur_activity):
                for ri in models.ResultIndicator.objects.filter(result=r):
                    models.ResultIndicatorPeriod.objects.filter(
                        result_indicator=ri).delete()
                    ri.delete()
                r.delete()

            for f in models.Ffs.objects.filter(activity=cur_activity):
                models.FfsForecast.objects.filter(ffs=f).delete()
                f.delete()

            for c in models.CrsAdd.objects.filter(activity=cur_activity):
                # The original only built these querysets; without .delete()
                # nothing was removed.
                models.CrsAddLoanStatus.objects.filter(crs_add=c).delete()
                models.CrsAddLoanTerms.objects.filter(crs_add=c).delete()
                c.delete()

            cur_activity.delete()

        except Exception as e:
            exception_handler(e, cur_activity.id, "remove_values_for_activity")
示例#9
0
    def update_publisher(self, publisher_iati_id, publisher_abbreviation, publisher_name):
        """
        Return the publisher with org_id ``publisher_iati_id``, adding it to
        the database when it is not stored yet.

        :param publisher_iati_id: value matched against ``Publisher.org_id``
            and ``OrganisationIdentifier.code``.
        :param publisher_abbreviation: abbreviation for a new publisher; it
            is overridden by the one on the matching organisation identifier
            row when such a row exists.
        :param publisher_name: name for a new publisher.
        :returns: the stored or newly added publisher, or None when an
            exception occurred (it is reported via ``exception_handler``).
        """
        try:
            # A single get() replaces the former exists() + get() pair, so
            # each lookup costs one query instead of two.
            try:
                return models.Publisher.objects.get(org_id=publisher_iati_id)
            except models.Publisher.DoesNotExist:
                # Not in the database yet; it is added below.
                pass

            # get the abbreviation from organisation_identifier table
            try:
                publisher_abbreviation = OrganisationIdentifier.objects.get(
                    code=publisher_iati_id).abbreviation
            except OrganisationIdentifier.DoesNotExist:
                # No identifier row: keep the abbreviation that was passed in.
                pass

            return self.add_publisher_to_db(publisher_iati_id, publisher_abbreviation, publisher_name)

        except Exception as e:
            exception_handler(e, publisher_iati_id, "dataset_syncer.update_publisher")
            return None
示例#10
0
    def parse_url(self, source):
        """
        Download the file behind ``source.source_url`` and run it through the
        parser.

        Nothing is parsed when the file grabber yields no file. Any exception
        raised while fetching or parsing is passed to ``exception_handler``
        rather than raised.
        """
        url = source.source_url
        # Only referenced by the disabled deletion step described below.
        xml_source_ref = source.ref
        # last_hash = source.last_hash

        try:
            iati_file = FileGrabber().get_the_file(url)
            if not iati_file:
                return

            # Deleting the old activities of this source first (through
            # Deleter().delete_by_source(xml_source_ref)) is switched off.
            # TODO: determine this in the parser based on last-updated-datetime
            # TODO: also, throw away all narratives

            root = etree.fromstring(str(iati_file.read()))
            self.prepare_parser(root, source).load_and_parse(root)

            # In debug mode the query log keeps growing; reset it so memory
            # does not overflow.
            if settings.DEBUG:
                from django import db
                db.reset_queries()

        except Exception as e:
            exception_handler(e, "parse url", "parse_url")
示例#11
0
    def parse_json_line(self, line, data_type):
        """
        Parse line from IATI response

        Reads publisher id, source URL, name, title and publisher name from
        ``line`` and then either registers a new source (URL not yet in
        ``self.source_urls``) or refreshes the stored ``IatiXmlSource`` for
        that URL.

        :param line: dataset entry (dict) from the registry response.
        :param data_type: passed on to ``add_iati_xml_source_to_db``.
        """
        try:
            publisher_iati_id = line['extras']['publisher_iati_id']
        except KeyError:
            publisher_iati_id = None

        publisher_abbreviation = ''
        publisher_name = 'Unknown'
        try:
            source_url = str(line['res_url'][0]).replace(' ', '%20')
        except IndexError:
            # Empty resource list: fall back to an empty URL.
            source_url = ''
        source_name = line.get('name', '')
        source_title = line.get('title', '')

        try:
            data_dict = json.loads(line.get('data_dict', ''))
            publisher_name = data_dict['organization']['title']
        except (ValueError, KeyError):
            # Missing or undecodable data_dict, or no organisation title:
            # the publisher name stays 'Unknown'.
            pass
        except Exception as e:
            msg = ("Unexpected error in synchronize_with_iati_api_by_page "
                   "organisation match:")
            exception_handler(e, 'synchronize_with_iati_api_by_page', msg)

        if source_url not in self.source_urls:

            if publisher_iati_id:
                current_publisher = self.update_publisher(
                    publisher_iati_id,
                    publisher_abbreviation,
                    publisher_name)
            else:
                current_publisher = self.add_publisher_to_db(
                    'Unknown',
                    publisher_abbreviation,
                    publisher_name)

            self.add_iati_xml_source_to_db(
                source_url,
                source_title,
                source_name,
                current_publisher,
                data_type)

            self.source_urls.append(source_url)

        else:
            msg = "Updated publisher and last found in registry on: "
            exception_handler(None, msg, source_url)

            source = models.IatiXmlSource.objects.get(source_url=source_url)
            source.last_found_in_registry = datetime.datetime.now()

            # Without a publisher id from the registry there is nothing to
            # look up, so the stored publisher is left untouched.
            publisher_changed = bool(publisher_iati_id) and (
                source.publisher.org_id != publisher_iati_id)
            if publisher_changed:
                new_publisher = self.update_publisher(
                    publisher_iati_id,
                    publisher_abbreviation,
                    publisher_name)
                # A None result means update_publisher could not supply a
                # publisher; do not overwrite the current one with it.
                if new_publisher is not None:
                    source.publisher = new_publisher
            source.save(process=False, added_manually=False)
示例#12
0
class DatasetSyncer():
    """
    Keeps the stored IATI XML sources in line with the datasets listed by the
    IATI registry search API.
    """

    # Start looping through the datasets
    def synchronize_with_iati_api(self, type):
        """
        Walk the registry search results page by page.

        :param type: 1 for activity datasets, 2 for organisation datasets;
            the value is also passed on to the per-page method.
        :raises ValueError: for any other ``type``. This used to surface as
            an UnboundLocalError on the first use of the URL.
        """
        if type == 1:
            cur_url = "http://www.iatiregistry.org/api/search/dataset?extras_filetype=activity&all_fields=1&limit=200&offset="
        elif type == 2:
            cur_url = "http://www.iatiregistry.org/api/search/dataset?extras_filetype=organisation&all_fields=1&limit=200&offset="
        else:
            raise ValueError(
                "Unknown dataset type: %r (expected 1 or 2)" % (type,))

        # The base URL is the same for every page, so clean it up once.
        cur_url = cur_url.strip().replace(" ", "%20")

        # Offsets 0, 200, ..., 9800: fifty pages of 200 results each.
        for offset in range(0, 10000, 200):
            self.synchronize_with_iati_api_by_page(cur_url + str(offset), type)

    # Loop through the datasets by page
    def synchronize_with_iati_api_by_page(self,
                                          cur_url,
                                          cur_type,
                                          try_number=0):
        """
        Fetch one result page and synchronise every dataset on it.

        :param cur_url: URL of the page to fetch.
        :param cur_type: dataset type, handed to
            ``add_iati_xml_source_to_db`` for newly found sources.
        :param try_number: failed attempts so far; the request is retried
            while this is below 6.
        :returns: always None.
        """
        try:
            response = urllib2.build_opener().open(urllib2.Request(cur_url))
            try:
                json_objects = json.load(response)
            finally:
                # Release the connection even when decoding fails.
                response.close()

            if json_objects is not None:
                #   For each dataset object
                for dataset in json_objects["results"]:
                    # One broken dataset is reported and skipped; the rest
                    # of the page is still processed.
                    try:
                        self._synchronize_dataset(dataset, cur_type)
                    except Exception as e:
                        exception_handler(e,
                                          "synchronize_with_iati_api_by_page",
                                          "Unexpected error")

        # "except ... as e" replaces the Python-2-only comma form and matches
        # the handlers above.
        except urllib2.HTTPError as e:
            exception_handler(e, "HTTP error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)
            else:
                return None
        except urllib2.URLError as e:
            exception_handler(e, "URL error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)

    def _synchronize_dataset(self, dataset, cur_type):
        """
        Add the source described by one registry dataset, or refresh the
        stored source when its download URL is already known.
        """
        publisher_iati_id = None
        publisher_abbreviation = None
        publisher_name = "Unknown"
        source_url = str(dataset["res_url"][0]).replace(" ", "%20")
        source_name = dataset["name"]
        source_title = dataset["title"] if "title" in dataset else None

        if "publisher_iati_id" in dataset["extras"]:
            publisher_iati_id = dataset["extras"]["publisher_iati_id"]

        if "data_dict" in dataset:
            try:
                data_dict = json.loads(dataset["data_dict"])
                if ("organization" in data_dict
                        and data_dict["organization"]
                        and "title" in data_dict["organization"]):
                    publisher_name = data_dict["organization"]["title"]
            except Exception as e:
                exception_handler(
                    e, "synchronize_with_iati_api_by_page",
                    "Unexpected error in synchronize_with_iati_api_by_page organisation match:"
                )

        #   If download url is not already in OIPA
        if not models.IatiXmlSource.objects.filter(
                source_url=source_url).exists():

            #   If publisher_iati_id is given (None and "" are both falsy)
            if publisher_iati_id:
                current_publisher = self.update_publisher(
                    publisher_iati_id, publisher_abbreviation,
                    publisher_name)
            else:
                # else publisher is unknown
                current_publisher = self.add_publisher_to_db(
                    "Unknown", "Unknown", "Unknown")

            self.add_iati_xml_source_to_db(
                source_url, source_title, source_name,
                current_publisher, cur_type)

        else:
            exception_handler(
                None,
                "Updated publisher and last found in registry on: ",
                source_url)

            cursource = models.IatiXmlSource.objects.get(
                source_url=source_url)
            cursource.last_found_in_registry = datetime.datetime.now()

            # Switch publisher only when the registry names one that differs
            # from the stored one; without an id there is nothing to look up.
            if (publisher_iati_id
                    and cursource.publisher.org_id != publisher_iati_id):
                new_current_publisher = self.update_publisher(
                    publisher_iati_id, publisher_abbreviation,
                    publisher_name)
                # NOTE(review): update_publisher is defined outside this
                # class body as visible here; a None result is assumed to
                # mean "lookup failed", so the stored publisher is kept.
                if new_current_publisher is not None:
                    cursource.publisher = new_current_publisher
            cursource.save(process=False, added_manually=False)
示例#13
0
                                          "Unexpected error")

        except urllib2.HTTPError, e:
            exception_handler(e, "HTTP error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)
            else:
                return None
        except urllib2.URLError, e:
            exception_handler(e, "URL error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)
        except httplib.HTTPException, e:
            exception_handler(e, "HTTP Exception", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,
                                                       try_number + 1)

    def update_publisher(self, publisher_iati_id, publisher_abbreviation,
                         publisher_name):

        try:
            #   if already in the database, get the publisher_id, else add the publisher
            if (models.Publisher.objects.filter(
                    org_id=publisher_iati_id).exists()):
                current_publisher = models.Publisher.objects.get(
                    org_id=publisher_iati_id)
            else:
                # get the abbreviation from organisation_identifier table
示例#14
0
    def synchronize_with_iati_api_by_page(self, cur_url, cur_type, try_number=0):
        """
        Fetch one registry result page and synchronise every dataset on it.

        For each dataset the download URL is looked up in ``IatiXmlSource``:
        an unknown URL is added as a new source, a known one gets a fresh
        ``last_found_in_registry`` and, when needed, another publisher.

        :param cur_url: URL of the page to fetch.
        :param cur_type: dataset type, handed to
            ``add_iati_xml_source_to_db`` for newly found sources.
        :param try_number: failed attempts so far; after an HTTP error the
            request is retried while this is below 6.
        :returns: always None.
        """
        try:
            response = urllib2.build_opener().open(urllib2.Request(cur_url))
            try:
                json_objects = json.load(response)
            finally:
                # Release the connection even when decoding fails.
                response.close()

            if json_objects is not None:

                #   For each dataset object ("dataset" instead of "object",
                #   which shadowed the builtin)
                for dataset in json_objects["results"]:

                    try:
                        publisher_iati_id = None
                        publisher_abbreviation = None
                        publisher_name = "Unknown"
                        source_url = str(dataset["res_url"][0]).replace(" ", "%20")
                        source_name = dataset["name"]
                        source_title = dataset["title"] if "title" in dataset else None

                        if "publisher_iati_id" in dataset["extras"]:
                            publisher_iati_id = dataset["extras"]["publisher_iati_id"]

                        if "data_dict" in dataset:
                            try:
                                data_dict = json.loads(dataset["data_dict"])
                                if ("organization" in data_dict
                                        and data_dict["organization"]
                                        and "title" in data_dict["organization"]):
                                    publisher_name = data_dict["organization"]["title"]
                            except Exception as e:
                                exception_handler(e, "synchronize_with_iati_api_by_page", "Unexpected error in synchronize_with_iati_api_by_page organisation match:")

                        #   If download url is not already in OIPA
                        if not models.IatiXmlSource.objects.filter(source_url=source_url).exists():

                            #   If publisher_iati_id is given (None and ""
                            #   are both falsy)
                            if publisher_iati_id:
                                current_publisher = self.update_publisher(publisher_iati_id, publisher_abbreviation, publisher_name)
                            else:
                                # else publisher is unknown
                                current_publisher = self.add_publisher_to_db("Unknown", "Unknown", "Unknown")

                            self.add_iati_xml_source_to_db(source_url, source_title, source_name, current_publisher, cur_type)

                        else:
                            exception_handler(None, "Updated publisher and last found in registry on: ", source_url)

                            cursource = models.IatiXmlSource.objects.get(source_url=source_url)
                            cursource.last_found_in_registry = datetime.datetime.now()

                            # Switch publisher only when the registry names
                            # one that differs from the stored one; without
                            # an id there is nothing to look up.
                            if publisher_iati_id and cursource.publisher.org_id != publisher_iati_id:
                                new_current_publisher = self.update_publisher(publisher_iati_id, publisher_abbreviation, publisher_name)
                                # NOTE(review): a None result is assumed to
                                # mean the lookup failed; the stored
                                # publisher is kept in that case.
                                if new_current_publisher is not None:
                                    cursource.publisher = new_current_publisher
                            cursource.save(process=False, added_manually=False)

                    except Exception as e:
                        exception_handler(e, "synchronize_with_iati_api_by_page", "Unexpected error")

        # "except ... as e" replaces the Python-2-only comma form and matches
        # the handlers above.
        except urllib2.HTTPError as e:
            exception_handler(e, "HTTP error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type, try_number + 1)
            else:
                return None
示例#15
0
                        exception_handler(e, "synchronize_with_iati_api_by_page", "Unexpected error")



        except urllib2.HTTPError, e:
            exception_handler(e, "HTTP error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,try_number + 1)
            else:
                return None
        except urllib2.URLError, e:
            exception_handler(e, "URL error", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,try_number + 1)
        except httplib.HTTPException, e:
            exception_handler(e, "HTTP Exception", cur_url)
            if try_number < 6:
                self.synchronize_with_iati_api_by_page(cur_url, cur_type,try_number + 1)


    def update_publisher(self, publisher_iati_id, publisher_abbreviation, publisher_name):

        try:
            #   if already in the database, get the publisher_id, else add the publisher
            if (models.Publisher.objects.filter(org_id=publisher_iati_id).exists()):
                current_publisher = models.Publisher.objects.get(org_id=publisher_iati_id)
            else:
                # get the abbreviation from organisation_identifier table
                if(OrganisationIdentifier.objects.filter(code=publisher_iati_id).exists()):
                    current_publisher_meta = OrganisationIdentifier.objects.get(code=publisher_iati_id)
                    publisher_abbreviation = current_publisher_meta.abbreviation