def ele(self):
    if self._ele is None:
        pk = Parker(xml_tostring=_tostring, element=ElementTree.Element)
        for ele in pk.etree(self._json_data):
            self.parse_json(self.config_node_parent, ele)
        self._ele = self.root(self.config_node_parent)
    return self._ele
def get_crime_data():
    """Fetch XML data and convert it to JSON using the Parker convention."""
    print('Getting crime data...')
    r = requests.get(url)
    pk = Parker(dict_type=dict)
    obj = pk.data(fromstring(r.text))
    return obj
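For context, the Parker convention drops the root element and all attributes, turns child tags into keys, and collapses repeated sibling tags into lists. A minimal sketch of the kind of value get_crime_data() returns, using a made-up XML payload in place of the real feed:

# Illustration only: the XML below is invented, not the actual crime feed.
from xml.etree.ElementTree import fromstring
from xmljson import Parker

pk = Parker(dict_type=dict)
doc = fromstring('<crimes><crime>theft</crime><crime>fraud</crime>'
                 '<year>2020</year></crimes>')
print(pk.data(doc))
# -> {'crime': ['theft', 'fraud'], 'year': 2020}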
def __init__(self, schema_file, domain_file=None, report_directory=None,
             es_url=None, es_index=None, es_region=None, api_token=None):
    """Construct a Parser instance.

    Parameters
    ----------
    schema_file : str
        The name of the file containing the XML schema defining a DMARC
        aggregate report.

    domain_file : str
        The name of the file to which a list of the domains encountered
        while parsing DMARC aggregate reports should be saved, or None if
        no such file is to be saved.

    report_directory : str
        The name of the directory to which XML files containing the DMARC
        aggregate reports encountered while parsing DMARC aggregate
        reports should be saved, or None if no such files are to be saved.

    es_url : str
        A URL corresponding to an AWS Elasticsearch instance where DMARC
        aggregate reports should be written.

    es_index : str
        The index to use when writing the DMARC aggregate reports to
        Elasticsearch.

    es_region : str
        The AWS region where the Elasticsearch instance is located.

    api_token : str
        The Dmarcian API token.
    """
    self.schema = etree.XMLSchema(file=schema_file)

    if domain_file is not None:
        self.domains = open(domain_file, 'w')
    else:
        self.domains = None

    self.report_directory = report_directory
    self.es_url = es_url
    self.es_index = es_index
    self.es_region = es_region

    # We don't care about the order of dictionary elements here, so we can
    # use a simple dict instead of the default OrderedDict
    self.parker = Parker(dict_type=dict)

    if api_token is not None:
        self.api_headers = {
            Parser.__DmarcianHeaderName:
            Parser.__DmarcianHeaderValue.format(api_token)
        }
    else:
        self.api_headers = None
def get_prices(date):
    html = requests.get(URL + date + '/').text
    i = html.find(PREFIX)
    if i < 0:
        return None
    link = html[i:i + PATH_LEN]
    r = requests.get(URL + date + '/' + link)
    xml = gzip.decompress(r.content)
    parker = Parker(dict_type=dict)
    return parker.data(fromstring(xml))
def parse_value(self, origin, value, tag):
    def convert_xml_to_lxml(xml_element, lxml_parent=None, default_ns=''):
        ns_name, tag = self.convert_tag(default_ns, xml_element.tag,
                                        src=ns_spec[origin]['val_name'],
                                        dst=Tag.LXML_ETREE)
        val_name_ns_tuple = self.convert_ns(
            ns_name, src=ns_spec[origin]['val_name'][0])
        nsmap = {None: val_name_ns_tuple[Tag.NAMESPACE]}
        val_name_ns = val_name_ns_tuple[ns_spec[origin]['val_val'][0]]
        if xml_element.text is not None:
            ns_val, text = self.convert_tag(val_name_ns, xml_element.text,
                                            src=ns_spec[origin]['val_val'],
                                            dst=Tag.JSON_PREFIX)
            if ns_val != val_name_ns:
                v_v_ns = self.convert_ns(ns_val,
                                         src=ns_spec[origin]['val_val'][0])
                v_v_prefix = v_v_ns[Tag.PREFIX]
                v_v_url = v_v_ns[Tag.NAMESPACE]
                nsmap[v_v_prefix] = v_v_url
        if lxml_parent is None:
            lxml_element = etree.Element(tag, nsmap=nsmap)
        else:
            lxml_element = etree.SubElement(lxml_parent, tag, nsmap=nsmap)
        if xml_element.text is not None:
            lxml_element.text = text
        for xml_child in xml_element:
            convert_xml_to_lxml(xml_child, lxml_parent=lxml_element,
                                default_ns=ns_name)
        return lxml_element

    n, t = self.convert_tag('', tag, src=Tag.LXML_ETREE,
                            dst=ns_spec[origin]['val_name'])
    json_val_str = '{{"{}": {}}}'.format(t, value.json_ietf_val.decode())
    json_data = json.loads(json_val_str, object_pairs_hook=OrderedDict)
    pk = Parker(xml_tostring=_tostring, element=ElementTree.Element)
    return [convert_xml_to_lxml(i) for i in pk.etree(json_data)]
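The pk.etree(...) call above goes in the opposite direction, JSON to XML: it turns a mapping into a list of ElementTree elements, one per top-level key, which the code then rebuilds as namespaced lxml elements. A minimal sketch of that direction, with an invented input dict:

# Illustration only: the input dict is invented.
from collections import OrderedDict
from xml.etree import ElementTree
from xmljson import Parker

pk = Parker(element=ElementTree.Element)
for ele in pk.etree(OrderedDict([('name', 'eth0'), ('mtu', 1500)])):
    print(ElementTree.tostring(ele))
# -> b'<name>eth0</name>' and b'<mtu>1500</mtu>'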
def get_json_instance(node):
    pk = Parker(xml_fromstring=_fromstring, dict_type=OrderedDict)
    default_ns = {}
    nodes = [node] + node.findall('.//')
    for item in nodes:
        parents = [
            p for p in node.findall('.//{}/..'.format(item.tag))
            if item in p.findall('*')
        ]
        if parents and id(parents[0]) in default_ns:
            default_url = default_ns[id(parents[0])]
            ns, tag = self.device.convert_tag(default_url, item.tag,
                                              dst=Tag.JSON_NAME)
        else:
            ns, tag = self.device.convert_tag('', item.tag,
                                              dst=Tag.JSON_NAME)
        default_ns[id(item)] = ns
        item.tag = tag
    return pk.data(node, preserve_root=True)
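Note the preserve_root=True flag: by default Parker discards the root tag, and this flag keeps it as the top-level JSON key, so the instance stays addressable by its outermost element. A quick illustration with a made-up element:

# Illustration only: the XML is invented.
from xml.etree.ElementTree import fromstring
from xmljson import Parker

pk = Parker(dict_type=dict)
e = fromstring('<interface><name>eth0</name></interface>')
print(pk.data(e))                      # -> {'name': 'eth0'}
print(pk.data(e, preserve_root=True))  # -> {'interface': {'name': 'eth0'}}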
def xml2json(element, conv='parker'):
    """Take an XML record and return the JSON representation of it."""
    if conv == 'bf':
        convention = BadgerFish(xml_fromstring=str)
    elif conv == 'parker':
        convention = Parker(xml_fromstring=str)
    else:
        logging.critical('Invalid XML2JSON convention: ' + conv)
        raise ValueError('The parameter @conv should be "bf" or "parker", '
                         'not ' + conv)
    data = convention.data(element)
    return json.dumps(data, indent=' ', ensure_ascii=False)
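The practical difference between the two conventions chosen above, sketched on a made-up record: BadgerFish keeps the root tag, attributes (as '@'-prefixed keys), and element text (under '$'), while Parker drops the root and all attributes and keeps only child content.

# Illustration only: the XML record is invented.
from xml.etree.ElementTree import fromstring
from xmljson import BadgerFish, Parker

rec = fromstring('<book id="7"><title>Dune</title></book>')
print(BadgerFish(xml_fromstring=str, dict_type=dict).data(rec))
# -> {'book': {'@id': '7', 'title': {'$': 'Dune'}}}
print(Parker(xml_fromstring=str, dict_type=dict).data(rec))
# -> {'title': 'Dune'}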
def get_json_instance(node):
    pk = Parker(xml_fromstring=_fromstring, dict_type=OrderedDict)
    default_ns = {}
    for item in node.iter():
        parents = [p for p in node.iter() if item in p]
        if parents and id(parents[0]) in default_ns:
            ns, tag = self.device.convert_tag(default_ns[id(parents[0])],
                                              item.tag,
                                              dst=ns_spec[origin]['val_name'])
        else:
            ns, tag = self.device.convert_tag('', item.tag,
                                              dst=ns_spec[origin]['val_name'])
        default_ns[id(item)] = ns
        item.tag = tag
        if item.text:
            text = self.device.convert_tag(self._url_to_prefix[ns],
                                           item.text,
                                           src=Tag.JSON_PREFIX,
                                           dst=ns_spec[origin]['val_val'])[1]
            item.text = text
    return pk.data(node)
class Parser:
    """Class that handles the verification and parsing of DMARC aggregate
    reports.

    Attributes
    ----------
    schema : etree.XMLSchema
        The XML schema, loaded from the given schema file, that defines a
        DMARC aggregate report.

    domains : io.FileIO
        The file object to which a list of the domains encountered while
        parsing DMARC aggregate reports should be saved, or None if no
        such file is to be saved.

    report_directory : str
        The name of the directory to which XML files containing the DMARC
        aggregate reports encountered while parsing DMARC aggregate
        reports should be saved, or None if no such files are to be saved.

    es_url : str
        The URL of the AWS Elasticsearch instance where the DMARC
        aggregate reports should be written.

    es_index : str
        The index to use when writing the DMARC aggregate reports to
        Elasticsearch.

    es_region : str
        The AWS region where the Elasticsearch instance is located.

    parker : xmljson.Parker
        Converts XML to JSON using the Parker convention.  Since the
        aggregate report XSD does not define any attributes we can use
        this convention to simplify the JSON without losing any
        information.

    api_headers : dict
        The Dmarcian API authentication header.
    """

    """The URL for the Dmarcian API call that retrieves the bulk
    mail-sending organization (if any) associated with an IP.
    """
    __DmarcianApiUrl = 'https://dmarcian.com/api/v1/find/source/{}'

    """The name of the authentication header required by the Dmarcian API"""
    __DmarcianHeaderName = 'Authorization'

    """The value of the authentication header required by the Dmarcian API"""
    __DmarcianHeaderValue = 'Token {}'

    """The timeout in seconds to use when retrieving API data"""
    __Timeout = 300

    """The payload to use when creating the Elasticsearch index where DMARC
    aggregate reports are stored.
    """
    __IndexPayload = {
        'mappings': {
            '_doc': {
                'properties': {
                    'policy_published': {
                        'properties': {
                            'adkim': {'type': 'text'},
                            'aspf': {'type': 'text'},
                            'domain': {'type': 'text'},
                            'fo': {'type': 'long'},
                            'p': {'type': 'text'},
                            'pct': {'type': 'long'},
                            'sp': {'type': 'text'}
                        }
                    },
                    'record': {
                        'properties': {
                            'auth_results': {
                                'properties': {
                                    'dkim': {
                                        'properties': {
                                            'domain': {'type': 'text'},
                                            'human_result': {'type': 'text'},
                                            'result': {'type': 'text'},
                                            'selector': {'type': 'text'}
                                        }
                                    },
                                    'spf': {
                                        'properties': {
                                            'domain': {'type': 'text'},
                                            'result': {'type': 'text'},
                                            'scope': {'type': 'text'}
                                        }
                                    }
                                }
                            },
                            'identifiers': {
                                'properties': {
                                    'envelope_from': {'type': 'text'},
                                    'envelope_to': {'type': 'text'},
                                    'header_from': {'type': 'text'}
                                }
                            },
                            'row': {
                                'properties': {
                                    'count': {'type': 'long'},
                                    'policy_evaluated': {
                                        'properties': {
                                            'disposition': {'type': 'text'},
                                            'dkim': {'type': 'text'},
                                            'reason': {
                                                'properties': {
                                                    'comment': {'type': 'text'},
                                                    'type': {'type': 'text'}
                                                }
                                            },
                                            'spf': {'type': 'text'}
                                        }
                                    },
                                    'source_ip': {'type': 'text'}
                                }
                            }
                        }
                    },
                    'report_metadata': {
                        'properties': {
                            'date_range': {
                                'properties': {
                                    'begin': {'type': 'long'},
                                    'end': {'type': 'long'}
                                }
                            },
                            'email': {'type': 'text'},
                            'error': {'type': 'text'},
                            'extra_contact_info': {'type': 'text'},
                            'org_name': {'type': 'text'},
                            'report_id': {'type': 'text'}
                        }
                    },
                    'version': {'type': 'float'}
                }
            }
        }
    }

    def __init__(self, schema_file, domain_file=None, report_directory=None,
                 es_url=None, es_index=None, es_region=None, api_token=None):
        """Construct a Parser instance.

        Parameters
        ----------
        schema_file : str
            The name of the file containing the XML schema defining a
            DMARC aggregate report.

        domain_file : str
            The name of the file to which a list of the domains
            encountered while parsing DMARC aggregate reports should be
            saved, or None if no such file is to be saved.

        report_directory : str
            The name of the directory to which XML files containing the
            DMARC aggregate reports encountered while parsing DMARC
            aggregate reports should be saved, or None if no such files
            are to be saved.

        es_url : str
            A URL corresponding to an AWS Elasticsearch instance where
            DMARC aggregate reports should be written.

        es_index : str
            The index to use when writing the DMARC aggregate reports to
            Elasticsearch.

        es_region : str
            The AWS region where the Elasticsearch instance is located.

        api_token : str
            The Dmarcian API token.
        """
        self.schema = etree.XMLSchema(file=schema_file)

        if domain_file is not None:
            self.domains = open(domain_file, 'w')
        else:
            self.domains = None

        self.report_directory = report_directory
        self.es_url = es_url
        self.es_index = es_index
        self.es_region = es_region

        # We don't care about the order of dictionary elements here, so we
        # can use a simple dict instead of the default OrderedDict
        self.parker = Parker(dict_type=dict)

        if api_token is not None:
            self.api_headers = {
                Parser.__DmarcianHeaderName:
                Parser.__DmarcianHeaderValue.format(api_token)
            }
        else:
            self.api_headers = None

    def pp_validation_error(self, tree):
        """Pretty-print a validation error to the error log.

        Parameters
        ----------
        tree : etree.Element
            The XML element that caused the error.
        """
        logging.error(self.schema.error_log)
        line_num = 2  # Dunno, it lines up with error messages
        for line in etree.tostring(tree).decode().splitlines():
            logging.error('{}\t{}'.format(line_num, line))
            line_num += 1

    def process_message(self, message):
        """Process a (possibly multipart) email message containing one or
        more DMARC aggregate reports.

        Parameters
        ----------
        message : email.message.EmailMessage
            The email message to be processed.

        Returns
        -------
        bool: True if the message was parsed successfully and False
        otherwise.
        """
        # The binascii.Error and AssertionError that appear below are
        # raised if the payload contains a non-base64 digit.  We'll catch
        # the exceptions here since we want to process any other message
        # parts, but we'll log them and set success to False so that the
        # message isn't deleted.
        success = True
        if message.is_multipart():
            # Loop through message parts
            for part in message.get_payload():
                try:
                    success &= self.process_payload(
                        part.get_content_type(),
                        part.get_payload(decode=True))
                except (binascii.Error, AssertionError) as e:
                    logging.error('Unable to process a multipart message '
                                  'payload: %s', e)
                    success = False
                    continue
        else:
            # This isn't a multipart message
            try:
                success = self.process_payload(
                    message.get_content_type(),
                    message.get_payload(decode=True))
            except (binascii.Error, AssertionError) as e:
                logging.error('Unable to process a non-multipart message '
                              'payload: %s', e)
                success = False

        return success

    def process_payload(self, content_type, payload):
        """Process a (possibly compressed) payload containing a DMARC
        aggregate report.

        Parameters
        ----------
        content_type : str
            The content type of the payload.

        payload : str
            The (possibly compressed) payload.

        Returns
        -------
        bool: True if the payload was parsed successfully and False
        otherwise.
        """
        success = True
        if payload is not None:
            decoded_payload = decode_payload(content_type, payload)
            if decoded_payload is not None:
                patched_payload = patch_xml(decoded_payload)
                tree = None
                try:
                    tree = parse_payload(patched_payload)
                except etree.XMLSyntaxError as e:
                    pp_parse_error(patched_payload, e)
                    success = False

                if tree is not None:
                    valid = self.schema.validate(tree)
                    if valid:
                        logging.debug('RUA payload passed schema validation')
                        logging.debug('Report XML is: {}'.format(pp(tree)))
                        domain = tree.find('policy_published').find('domain').text
                        logging.info('Received a report for {}'.format(domain))

                        # Write the domain to the domains file if necessary
                        if self.domains is not None:
                            self.domains.write('{}\n'.format(domain))

                        # Write the report to the report directory if
                        # necessary
                        if self.report_directory is not None:
                            report_id = tree.find('report_metadata').find('report_id').text
                            with open('{}/{}.xml'.format(self.report_directory, report_id), 'w') as report_file:
                                report_file.write(etree.tostring(tree, pretty_print=True).decode())

                        # Convert the XML to JSON
                        jsn = self.parker.data(tree)

                        # Find the bulk mail-sending organizations (if any)
                        # associated with the IPs in the report.
                        #
                        # jsn['record'] can be a list if there are multiple
                        # record tags in the XML, or a dict if there is
                        # only a single record tag.  Parser.listify() will
                        # make sure that we have a list here.
                        for record in Parser.listify(jsn['record']):
                            if self.api_headers is not None:
                                ip = record['row']['source_ip']
                                url = Parser.__DmarcianApiUrl.format(ip)
                                try:
                                    response = requests.get(url,
                                                            headers=self.api_headers,
                                                            timeout=Parser.__Timeout)
                                    # Raises an exception if we didn't get
                                    # back a 200 code
                                    response.raise_for_status()
                                    record['row']['source_ip_affiliation'] = response.json()[ip]
                                except requests.exceptions.RequestException:
                                    logging.exception('Unable to use the Dmarcian API to determine the affiliation of source IP {}'.format(ip))
                                    # We can't query the Dmarcian API
                                    # because of an error, so just add an
                                    # empty entry
                                    record['row']['source_ip_affiliation'] = None
                                    success = False
                            else:
                                # We can't query the Dmarcian API because
                                # we don't have a token, so just add an
                                # empty entry
                                logging.debug('json is: {}'.format(jsn))
                                logging.debug('record is: {}'.format(record))
                                record['row']['source_ip_affiliation'] = None

                        # Write the report to Elasticsearch if necessary
                        if (self.es_url is not None) and (self.es_region is not None) and (self.es_index is not None):
                            credentials = boto3.Session().get_credentials()
                            awsauth = AWS4Auth(credentials.access_key,
                                               credentials.secret_key,
                                               self.es_region, 'es',
                                               session_token=credentials.token)

                            # Check if the index exists and create it if
                            # necessary
                            index_only_url = '{}/{}'.format(self.es_url,
                                                            self.es_index)
                            response = requests.head(index_only_url,
                                                     auth=awsauth,
                                                     timeout=Parser.__Timeout)
                            if response.status_code != 200:
                                logging.info('The index {} does not exist.  Creating it.'.format(self.es_index))
                                try:
                                    response = requests.put(index_only_url,
                                                            auth=awsauth,
                                                            json=Parser.__IndexPayload,
                                                            headers={'Content-Type': 'application/json'},
                                                            timeout=Parser.__Timeout)
                                    # Raises an exception if we didn't get
                                    # back a 200 code
                                    response.raise_for_status()
                                except requests.exceptions.RequestException:
                                    logging.exception('Unable to create the index {}.'.format(self.es_index))
                                    return False

                            # Now save the report
                            full_url = '{}/_doc'.format(index_only_url)
                            try:
                                response = requests.post(full_url,
                                                         auth=awsauth,
                                                         json=jsn,
                                                         headers={'Content-Type': 'application/json'},
                                                         timeout=Parser.__Timeout)
                                # Raises an exception if we didn't get back
                                # a 200 code
                                response.raise_for_status()
                            except requests.exceptions.RequestException:
                                logging.exception('Unable to save the DMARC aggregate report to Elasticsearch')
                                success = False
                    else:
                        logging.error('RUA payload failed schema validation')
                        self.pp_validation_error(tree)
                        success = False
            else:
                logging.error('RUA payload failed XML parsing')
                success = False

        return success

    @staticmethod
    def listify(x):
        """If x is a list then just return it.  If x is a dict then return
        a list with x as the sole item.

        Parameters
        ----------
        x : list, dict
            The list or dict to be listified.

        Returns
        -------
        list: x if x is a list.  If x is a dict then returns a list with x
        as the sole item.
        """
        retVal = x
        if isinstance(x, dict):
            retVal = [x]
        return retVal
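Why listify() is needed at all: under the Parker convention a repeated record tag becomes a list, but a lone record tag becomes a bare dict, so code iterating over jsn['record'] must normalize the shape first. A toy illustration with minimal stand-ins for real aggregate reports:

# Illustration only: the XML snippets are invented.
from xml.etree.ElementTree import fromstring
from xmljson import Parker

pk = Parker(dict_type=dict)
one = pk.data(fromstring('<feedback><record><row>1</row></record></feedback>'))
two = pk.data(fromstring('<feedback><record><row>1</row></record>'
                         '<record><row>2</row></record></feedback>'))
print(one['record'])  # -> {'row': 1}               (a dict)
print(two['record'])  # -> [{'row': 1}, {'row': 2}] (a list)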
def xml_to_json(data):
    pk = Parker(dict_type=OrderedDict)
    converted_json = json.dumps(pk.data(fromstring(data)))
    return converted_json
def xml2dict(xml_string, encoding="utf-8", dict_type=None):
    """Convert an XML string to a Python dictionary."""
    string = to_unicode(xml_string).encode(encoding)
    if dict_type is not None:
        return Parker(dict_type=dict_type).data(fromstring(string))
    return parker.data(fromstring(string))
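A sketch of how this helper might be called, assuming the module-level parker instance defined below; the XML string is invented:

doc = '<person><name>Ada</name><age>36</age></person>'
xml2dict(doc)                         # -> {'name': 'Ada', 'age': 36}
xml2dict(doc, dict_type=OrderedDict)  # same data, but key order preserved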
import re
from collections import OrderedDict, MutableMapping
from shutil import copyfileobj
from mimetypes import guess_type
from io import BytesIO, open

import requests
from xmljson import Parker
from lxml.etree import Element, fromstring, tostring

from .compat import (iteritems, to_unicode, is_py2, str, urlparse,
                     pathname2url)

parker = Parker(dict_type=dict)


class PrettyStringRepr(str):
    # Useful for debug
    def __repr__(self):
        if is_py2:
            return to_unicode(self.replace(' \n', '\n').strip()).encode('utf-8')
        else:
            return to_unicode(self.replace(' \n', '\n').strip())


def camelize_dict(data, uppercase_first_letter=False):
    """ Returns a dict with camel case keys.