def get_xml_activity_amount(self, url):
    """
    Count the ``</iati-identifier>`` closing tags in the file at ``url``.

    The file is fetched through ``FileGrabber`` and scanned line by line.
    Every occurrence of the closing tag is counted, so several tags on the
    same line are all included.

    :param url: location of the XML file to scan.
    :return: the number of closing tags found, or ``None`` when fetching or
        scanning raised an exception (the error is printed, not raised).
    """
    closing_tag = "</iati-identifier>"
    try:
        file_grabber = FileGrabber()
        xml_file = file_grabber.get_the_file(url)
        # count() already yields 0 for lines without the tag, so a separate
        # membership test would only scan every line twice.
        occurences = sum(line.count(closing_tag) for line in xml_file)
        # Drop the local reference and force a garbage-collection pass
        # before returning.
        del xml_file
        gc.collect()
        return occurences
    except Exception as e:
        if e.args:
            print(e.args[0])
        # %-formatting instead of string concatenation: a non-string url
        # (e.g. None) must not raise a TypeError from inside this handler.
        print("ERROR IN GET_XML_ACTIVITY_AMOUNT, FILE URL %s" % (url,))
        return None
def update_activities_count(self):
    """
    Refresh the activity counters stored on this object.

    Two attributes are updated and then persisted with
    ``self.save(process=False)``:

    * ``activities_count_in_xml`` - derived from the number of direct
      children of the root element of the XML fetched from
      ``self.source_url``;
    * ``activities_count_in_database`` - ``self.activity_set.all().count()``.

    Any exception is logged (with traceback) and swallowed; nothing is
    raised to the caller and nothing is returned.
    """
    try:
        # Activity count in the XML
        file_grabber = FileGrabber()
        response = file_grabber.get_the_file(self.source_url)

        # Parse to XML tree
        tree = etree.fromstring(response.content)

        # len(element) is the number of direct children; it replaces the
        # deprecated getchildren() call.
        count = len(tree)

        # One child is left out of the total whenever the root has children.
        # NOTE(review): presumably one child is not an activity - confirm.
        self.activities_count_in_xml = count - 1 if count > 0 else count

        # Activity count in the Database
        self.activities_count_in_database = self.activity_set.all().count()

        self.save(process=False)
    except Exception as e:
        # Same message as before, but keep the traceback in the log.
        logger.exception(e)
def parse_url(self, source):
    """
    Fetch the XML behind ``source.source_url`` and run the parser on it.

    :param source: object exposing ``source_url``; it is also passed on to
        ``self.prepare_parser``.

    Nothing is returned. When the grabber returns a falsy value nothing is
    parsed. Exceptions raised while fetching or parsing are handed to
    ``exception_handler`` instead of propagating.
    """
    url = source.source_url
    # last_hash = source.last_hash

    try:
        file_grabber = FileGrabber()
        iati_file = file_grabber.get_the_file(url)

        if iati_file:
            # delete old activities
            # TODO: determine this in the parser based on last-updated-datetime
            # TODO: also, throw away all narratives
            # try:
            #     deleter = Deleter()
            #     deleter.delete_by_source(source.ref)
            # except Exception as e:
            #     exception_handler(e, "parse url", "delete by source")

            data = iati_file.read()
            # Pass the payload through unchanged: wrapping it in str() turns
            # a bytes payload into its "b'...'" repr on Python 3, which is
            # not XML.
            root = etree.fromstring(data)

            parser = self.prepare_parser(root, source)
            parser.load_and_parse(root)

            # Throw away query logs when in debug mode to prevent memory
            # from overflowing
            if settings.DEBUG:
                from django import db
                db.reset_queries()

    except Exception as e:
        exception_handler(e, "parse url", "parse_url")
def __init__(self, dataset, root=None, force_reparse=False):
    """
    Set up an IATI parser for ``dataset``.

    :param dataset: dataset record; ``source_url`` and ``sha1`` are read,
        and ``sha1`` / ``note_count`` may be written and saved.
    :param root: optional, already parsed XML root. When given, nothing is
        downloaded and no hashing takes place.
    :param force_reparse: stored on the instance as ``force_reparse``.
    :raises ParserDisabledError: when ``settings.IATI_PARSER_DISABLED`` is
        set.

    After construction ``valid_dataset`` is False when the URL could not
    be fetched or the content raised an XML syntax error, and
    ``hash_changed`` is False when the content hash equals the stored one.
    """
    if settings.IATI_PARSER_DISABLED:
        raise ParserDisabledError(
            "The parser is disabled on this instance of OIPA")

    self.dataset = dataset
    self.url = dataset.source_url
    self.force_reparse = force_reparse
    self.hash_changed = True
    self.valid_dataset = True

    # A caller-supplied tree short-circuits download, hashing and parsing.
    if root is not None:
        self.root = root
        self.parser = self._prepare_parser(self.root, dataset)
        return

    response = FileGrabber().get_the_file(self.url)

    from iati_synchroniser.models import DatasetNote

    if not response or response.status_code != 200:
        self.valid_dataset = False
        DatasetNote(
            dataset=self.dataset,
            iati_identifier="n/a",
            model="n/a",
            field="n/a",
            message="Cannot access the URL",
            exception_type='UrlError',
            line_number=None
        ).save()
        self.dataset.note_count = 1
        # Without an XML file the stored sha1 is blanked
        self.dataset.sha1 = ''
        self.dataset.save()
        return

    # 1. Decode the byte payload into text.
    # XXX: some files contain non utf-8 characters.
    # FIXME: the latin-1 fallback is hardcoded.
    try:
        decoded = smart_text(response.content, 'utf-8')
    except UnicodeDecodeError:
        decoded = smart_text(response.content, 'latin-1')

    # 2. Hash the UTF-8 encoding of the decoded text.
    digest = hashlib.sha1(decoded.encode('utf-8')).hexdigest()

    if dataset.sha1 != digest:
        # New or changed content: persist the hash right away
        dataset.sha1 = digest
        dataset.save()
    else:
        # dataset did not change, no need to reparse normally
        self.hash_changed = False

    try:
        self.root = etree.fromstring(response.content)
        self.parser = self._prepare_parser(self.root, dataset)

        if settings.ERROR_LOGS_ENABLED:
            self.xsd_validate()

    # TODO: when moving error messages to frontend, create a separate error
    # for wrong file type:
    except etree.XMLSyntaxError:
        self.valid_dataset = False
        # Replace all existing notes of this dataset with a single one
        DatasetNote.objects.filter(dataset=self.dataset).delete()
        DatasetNote(
            dataset=self.dataset,
            iati_identifier="n/a",
            model="n/a",
            field="n/a",
            message="This file contains XML syntax errors or it's not an "
                    "XML file",
            exception_type='XMLSyntaxError',
            line_number=None
        ).save()
        self.dataset.note_count = 1
        # Content that is not XML gets no sha1
        self.dataset.sha1 = ''
        self.dataset.save()
def __init__(self, dataset, root=None, force_reparse=False):
    """
    Given an IATI dataset, prepare an IATI parser.

    :param dataset: dataset record; ``source_url`` and ``sha1`` are read,
        ``sha1`` / ``note_count`` may be written.
    :param root: optional, already parsed XML root. When given, nothing is
        downloaded and no hashing takes place.
    :param force_reparse: stored on the instance as ``force_reparse``.
    :raises ParserDisabledError: when ``settings.IATI_PARSER_DISABLED`` is
        set.

    The response returned by the grabber is read through ``.code`` and
    ``.read()``. After construction ``valid_dataset`` is False when the URL
    could not be fetched or the content raised an XML syntax error, and
    ``hash_changed`` is False when the content hash equals the stored one.
    """
    if settings.IATI_PARSER_DISABLED:
        raise ParserDisabledError(
            "The parser is disabled on this instance of OIPA")

    self.dataset = dataset
    self.url = dataset.source_url
    self.force_reparse = force_reparse
    self.hash_changed = True
    self.valid_dataset = True

    if root is not None:
        self.root = root
        self.parser = self._prepare_parser(self.root, dataset)
        return

    file_grabber = FileGrabber()
    response = file_grabber.get_the_file(self.url)

    from iati_synchroniser.models import DatasetNote

    if not response or response.code != 200:
        self.valid_dataset = False
        # Replace all existing notes of this dataset with a single one
        DatasetNote.objects.filter(dataset=self.dataset).delete()
        note = DatasetNote(
            dataset=self.dataset,
            iati_identifier="n/a",
            model="n/a",
            field="n/a",
            message="URL down or does not exist",
            exception_type='UrlError',
            line_number=None)
        note.save()
        self.dataset.note_count = 1
        self.dataset.save()
        return

    # Keep the payload as bytes. str() on a bytes payload yields its
    # "b'...'" repr on Python 3, which can neither be hashed (hashlib
    # rejects text) nor parsed as XML.
    iati_file = response.read()
    if not isinstance(iati_file, bytes):
        # Text payload: encode it so hashing and parsing see bytes
        iati_file = iati_file.encode('utf-8')

    hasher = hashlib.sha1()
    hasher.update(iati_file)
    sha1 = hasher.hexdigest()

    if dataset.sha1 == sha1:
        # dataset did not change, no need to reparse normally
        self.hash_changed = False
    else:
        # Only set on the instance here; this method does not save it
        dataset.sha1 = sha1

    try:
        self.root = etree.fromstring(iati_file)
        self.parser = self._prepare_parser(self.root, dataset)
    except etree.XMLSyntaxError:
        self.valid_dataset = False
        DatasetNote.objects.filter(dataset=self.dataset).delete()
        note = DatasetNote(
            dataset=self.dataset,
            iati_identifier="n/a",
            model="n/a",
            field="n/a",
            message="This file contains XML syntax errors",
            exception_type='XMLSyntaxError',
            line_number=None)
        note.save()
        self.dataset.note_count = 1
        self.dataset.save()
        return
def __init__(self, dataset, root=None, force_reparse=False):
    """
    Create a parser for an IATI dataset.

    :param dataset: dataset record; ``source_url`` and ``sha1`` are read,
        and ``sha1`` / ``note_count`` may be written and saved.
    :param root: optional, already parsed XML root. When given, nothing is
        downloaded and no hashing takes place.
    :param force_reparse: stored on the instance as ``force_reparse``.
    :raises ParserDisabledError: when ``settings.IATI_PARSER_DISABLED`` is
        set.

    The document is parsed with an ``etree.XMLParser(huge_tree=True)``.
    After construction ``valid_dataset`` is False when the URL could not
    be fetched or the content raised an XML syntax error, and
    ``hash_changed`` is False when the content hash equals the stored one.
    """
    if settings.IATI_PARSER_DISABLED:
        raise ParserDisabledError(
            "The parser is disabled on this instance of OIPA")

    self.dataset = dataset
    self.url = dataset.source_url
    self.force_reparse = force_reparse
    self.hash_changed = True
    self.valid_dataset = True

    # Nothing to download when the caller already holds a parsed tree.
    if root is not None:
        self.root = root
        self.parser = self._prepare_parser(self.root, dataset)
        return

    grabber = FileGrabber()
    response = grabber.get_the_file(self.url)

    from iati_synchroniser.models import DatasetNote

    if not response or response.status_code != 200:
        self.valid_dataset = False
        url_error = DatasetNote(
            dataset=self.dataset,
            iati_identifier="n/a",
            model="n/a",
            field="n/a",
            message="Cannot access the URL",
            exception_type='UrlError',
            line_number=None)
        url_error.save()

        self.dataset.note_count = 1
        # Without an XML file the stored sha1 is blanked
        self.dataset.sha1 = ''
        self.dataset.save()
        return

    # 1. Turn the byte payload into text using the given encoding.
    # XXX: some files contain non utf-8 characters.
    # FIXME: the latin-1 fallback is hardcoded.
    try:
        text = smart_text(response.content, 'utf-8')
    except UnicodeDecodeError:
        text = smart_text(response.content, 'latin-1')

    # 2. Hash the UTF-8 encoding of that text.
    sha1_builder = hashlib.sha1()
    sha1_builder.update(text.encode('utf-8'))
    content_sha1 = sha1_builder.hexdigest()

    unchanged = dataset.sha1 == content_sha1
    if unchanged:
        # dataset did not change, no need to reparse normally
        self.hash_changed = False
    else:
        # New or changed content: store the hash immediately
        dataset.sha1 = content_sha1
        dataset.save()

    try:
        xml_parser = etree.XMLParser(huge_tree=True)
        document = etree.parse(BytesIO(response.content), xml_parser)
        self.root = document.getroot()
        self.parser = self._prepare_parser(self.root, dataset)

        if settings.ERROR_LOGS_ENABLED:
            self.xsd_validate()

    # TODO: when moving error messages to frontend, create a separate error
    # for wrong file type:
    except etree.XMLSyntaxError:
        self.valid_dataset = False
        # Replace all existing notes of this dataset with a single one
        DatasetNote.objects.filter(dataset=self.dataset).delete()
        syntax_error = DatasetNote(
            dataset=self.dataset,
            iati_identifier="n/a",
            model="n/a",
            field="n/a",
            message="This file contains XML syntax errors or it's not an "
                    "XML file",
            exception_type='XMLSyntaxError',
            line_number=None)
        syntax_error.save()

        self.dataset.note_count = 1
        # Content that is not XML gets no sha1
        self.dataset.sha1 = ''
        self.dataset.save()
        return