def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    `file_content`
        The markup of the cable page; it must contain a
        ``<table class='cable'>`` element.
    `cable`
        The object to update. Its `reference_id` attribute is read, the
        attributes `created`, `origin`, `classification` and (if media
        references are found) `media_uris` are written.

    Returns the provided `cable`.

    Raises a `ValueError` if the table markup is missing, if the metadata
    cannot be extracted from it, or if the reference id found in the table
    is neither equal to `cable.reference_id` nor mapped to it by
    `MALFORMED_CABLE_IDS` / `INVALID_CABLE_IDS`.
    """
    # Only look at the last "cable" table which precedes the last </table>.
    # ``str.rindex`` raises a ValueError itself if the markup is missing.
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # The groups must be wrapped into a 1-tuple: a bare tuple on the
        # right-hand side of ``%`` is interpreted as the argument list and
        # would cause a TypeError instead of the intended ValueError.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    if cable.reference_id != ref:
        # The id in the table may be a known malformed / invalid variant
        # of the cable's reference id.
        known_fixes = (MALFORMED_CABLE_IDS.get(ref), INVALID_CABLE_IDS.get(ref))
        if cable.reference_id not in known_fixes:
            raise ValueError('cable.reference_id != ref. reference_id="%s", ref="%s"' % (cable.reference_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # ``rfind`` returns -1 if nothing was found. A hit at index 0 is
    # impossible since the searched range starts with the table tag.
    if media_idx != -1:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content, media_idx, end_idx)
    return cable
def parse_meta(file_content, cable):
    """\
    Extracts the reference id, date/time of creation, the classification,
    and the origin of the cable and assigns the values to the provided `cable`.

    `file_content`
        The markup of the cable page; it must contain a
        ``<table class='cable'>`` element.
    `cable`
        The object to update. Its `reference_id` attribute is read, the
        attributes `created`, `origin`, `classification` and (if media
        references are found) `media_uris` are written.

    Returns the provided `cable`.

    Raises a `ValueError` if the table markup is missing, if the metadata
    cannot be extracted from it, or if the reference id found in the table
    is neither equal to `cable.reference_id` nor mapped to it by
    `MALFORMED_CABLE_IDS` / `INVALID_CABLE_IDS`.
    """
    # Only look at the last "cable" table which precedes the last </table>.
    # ``str.rindex`` raises a ValueError itself if the markup is missing.
    end_idx = file_content.rindex("</table>")
    start_idx = file_content.rindex("<table class='cable'>", 0, end_idx)
    m = _META_PATTERN.search(file_content, start_idx, end_idx)
    if not m:
        raise ValueError('Cable table not found')
    groups = m.groups()
    if len(groups) != 4:
        # The groups must be wrapped into a 1-tuple: a bare tuple on the
        # right-hand side of ``%`` is interpreted as the argument list and
        # would cause a TypeError instead of the intended ValueError.
        raise ValueError('Unexpected metadata result: "%r"' % (groups,))
    # Table content:
    # Reference ID | Created | Classification | Origin
    ref, created, classification, origin = groups
    if cable.reference_id != ref:
        # The id in the table may be a known malformed / invalid variant
        # of the cable's reference id.
        known_fixes = (MALFORMED_CABLE_IDS.get(ref), INVALID_CABLE_IDS.get(ref))
        if cable.reference_id not in known_fixes:
            raise ValueError(
                'cable.reference_id != ref. reference_id="%s", ref="%s"'
                % (cable.reference_id, ref))
    cable.created = created
    cable.origin = origin
    # classifications are usually written in upper case, but you never know..
    cable.classification = classification.upper()
    # Try to find media IRIs
    media_idx = file_content.rfind(u'Appears in these', start_idx, end_idx)
    # ``rfind`` returns -1 if nothing was found. A hit at index 0 is
    # impossible since the searched range starts with the table tag.
    if media_idx != -1:
        cable.media_uris = _MEDIA_URLS_PATTERN.findall(file_content, media_idx, end_idx)
    return cable
def canonicalize_id(reference_id):
    """\
    Returns the canonical form of the provided cable reference identifier.

    WikiLeaks provides some malformed cable identifiers. An identifier
    which is not valid is translated into its valid equivalent, a valid
    identifier is returned unchanged.

    Note: The canonical form is not necessarily a valid WikiLeaks
    identifier. Usually both are identical, but a malformed identifier
    like "09SECTION01OF03SANJOSE525" becomes "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    # Identifiers which carry the "EMBASSY" marker: simply drop the marker.
    if u'EMBASSY' in reference_id:
        return reference_id.replace(u'EMBASSY', u'')
    # Identifiers with an origin for which a fixed spelling is known.
    match = _C14N_PATTERN.match(reference_id)
    if match is not None:
        wrong_origin = match.group(1)
        return reference_id.replace(wrong_origin, _C14N_FIXES[wrong_origin])
    # Fall back to the lookup tables; the malformed ids take precedence
    # over the invalid ids, unknown ids are returned as they are.
    fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
    return MALFORMED_CABLE_IDS.get(reference_id, fallback)
def canonicalize_id(reference_id):
    """\
    Returns the canonical form of the provided cable reference identifier.

    WikiLeaks provides some malformed cable identifiers. An identifier
    which is not valid is translated into its valid equivalent, a valid
    identifier is returned unchanged.

    Note: The canonical form is not necessarily a valid WikiLeaks
    identifier. Usually both are identical, but a malformed identifier
    like "09SECTION01OF03SANJOSE525" becomes "09SANJOSE525".

    `reference_id`
        The cable identifier to canonicalize
    """
    # Identifiers which carry the "EMBASSY" marker: simply drop the marker.
    if u'EMBASSY' in reference_id:
        return reference_id.replace(u'EMBASSY', u'')
    # Identifiers with an origin for which a fixed spelling is known.
    match = _C14N_PATTERN.match(reference_id)
    if match is not None:
        wrong_origin = match.group(1)
        return reference_id.replace(wrong_origin, _C14N_FIXES[wrong_origin])
    # Fall back to the lookup tables; the malformed ids take precedence
    # over the invalid ids, unknown ids are returned as they are.
    fallback = INVALID_CABLE_IDS.get(reference_id, reference_id)
    return MALFORMED_CABLE_IDS.get(reference_id, fallback)
"""\ This module reports malformed cable ids. """ import os import re from cablemap.core.consts import REFERENCE_ID_PATTERN, MALFORMED_CABLE_IDS, INVALID_CABLE_IDS def find_malformed_ids(in_dir): dct = {} for root, dirs, files in os.walk(in_dir): for name in (n for n in files if '.html' in n): reference_id = name[:name.rindex('.')] if not REFERENCE_ID_PATTERN.match(reference_id): dct[reference_id] = os.path.join(root, name) return dct if __name__ == '__main__': import os, codecs if not os.path.isdir('./cable/'): raise Exception('Expected a directory "cable"') current = set(MALFORMED_CABLE_IDS.keys()) | set(INVALID_CABLE_IDS.keys()) dct = find_malformed_ids('./cable/') s = set(dct.keys()) diff = s ^ current if diff: print('difference: %r' % diff) for ref in diff: print('%s: %s' % (ref, dct.get(ref)))
def _get_test_cases():
    """\
    Returns an iterator over the (key, value) pairs of `INVALID_CABLE_IDS`.
    """
    # ``dict.iteritems()`` exists under Python 2 only. ``items()`` works
    # with Python 2 and 3; ``iter()`` ensures that the result is still an
    # iterator (and not a list / view) for both versions.
    return iter(INVALID_CABLE_IDS.items())
def test_c14n_illegal_ids():
    """\
    Generator test: yields one check per entry of `INVALID_CABLE_IDS`.

    Each check asserts that the incorrect id (key) and the correct id
    (value) are canonicalized to the same identifier.
    """
    def check(incorrect_id, correct_id):
        eq_(canonicalize_id(correct_id), canonicalize_id(incorrect_id))
    # ``items()`` instead of the Python 2-only ``iteritems()`` so that the
    # test is collected under Python 3 as well.
    for incorrect_id, correct_id in INVALID_CABLE_IDS.items():
        yield check, incorrect_id, correct_id
# -*- coding: utf-8 -*-
"""\
This module reports malformed cable ids.
"""
import os
import re
from cablemap.core.consts import REFERENCE_ID_PATTERN, MALFORMED_CABLE_IDS, INVALID_CABLE_IDS


def find_malformed_ids(in_dir):
    """\
    Walks through `in_dir` (recursively) and returns a dict which maps
    every reference id that is not matched by `REFERENCE_ID_PATTERN` to
    the path of the file it was derived from.

    The reference id is the file name without the ".html" extension, files
    with a different extension are ignored.

    `in_dir`
        The directory to scan.
    """
    suffix = '.html'
    malformed = {}
    for root, _dirs, files in os.walk(in_dir):
        for name in files:
            # Check the extension, not just any occurrence of ".html":
            # a name like "x.html.bak" would be reported as id "x.html".
            if not name.endswith(suffix):
                continue
            reference_id = name[:-len(suffix)]
            if not REFERENCE_ID_PATTERN.match(reference_id):
                malformed[reference_id] = os.path.join(root, name)
    return malformed


if __name__ == '__main__':
    import os
    import codecs
    if not os.path.isdir('./cable/'):
        raise Exception('Expected a directory "cable"')
    # All ids which are already known to be malformed or invalid
    current = set(MALFORMED_CABLE_IDS) | set(INVALID_CABLE_IDS)
    dct = find_malformed_ids('./cable/')
    # Symmetric difference: ids found on disk but unknown to the tables
    # and ids listed in the tables but not found on disk.
    diff = set(dct) ^ current
    if diff:
        print('difference: %r' % diff)
        for ref in diff:
            # The path is None for ids which are only listed in the tables
            print('%s: %s' % (ref, dct.get(ref)))