class IbanType(PropertyType):
    """International Bank Account Numbers (IBAN)."""

    name = "iban"
    group = "ibans"
    label = _("IBAN")
    plural = _("IBANs")
    matchable = True
    pivot = True

    def validate(self, text, **kwargs):
        """Check whether the given text parses as a valid IBAN."""
        cleaned = sanitize_text(text)
        try:
            return iban.validate(cleaned)
        except ValidationError:
            return False

    def clean_text(self, text, **kwargs):
        """Create a more clean, but still user-facing version of an
        instance of the type."""
        return text.replace(" ", "").upper()

    def country_hint(self, value):
        # The leading two characters of an IBAN are an ISO country code.
        return value[:2].lower()

    def rdf(self, value):
        return URIRef(self.node_id(value))

    def node_id(self, value):
        return f"iban:{value.upper()}"

    def caption(self, value):
        return iban.format(value)
class EntityType(PropertyType):
    """A reference to another entity by its identifier."""

    ID_RE = re.compile(r'^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$')

    name = 'entity'
    group = 'entities'
    label = _('Entity')
    plural = _('Entities')
    matchable = True
    pivot = True

    def validate(self, text, **kwargs):
        """An entity ID is valid when it matches the permitted alphabet."""
        text = sanitize_text(text)
        if text is None:
            return False
        return self.ID_RE.match(text) is not None

    def clean(self, text, **kwargs):
        """Extract the referenced entity ID and return it if well-formed."""
        entity_id = get_entity_id(text)
        if self.validate(entity_id):
            return entity_id

    def rdf(self, value):
        return URIRef('entity:%s' % value)

    def caption(self, value):
        # Entity references carry no human-readable caption.
        return None
class IpType(PropertyType):
    """Internet protocol addresses. Both protocol version 4 addresses
    (e.g. ``192.168.1.143``) and version 6 addresses
    (e.g. ``0:0:0:0:0:ffff:c0a8:18f``) are supported."""

    name = "ip"
    group = "ips"
    label = _("IP-Address")
    plural = _("IP-Addresses")
    matchable = True
    pivot = True

    def validate(self, ip, **kwargs):
        """Check to see if this is a valid ip address."""
        try:
            ip_address(ip)
        except ValueError:
            return False
        return True

    def clean_text(self, text, **kwargs):
        """Normalise to the canonical string form of the address, or
        None when the value does not parse."""
        try:
            return str(ip_address(text))
        except ValueError:
            return None

    def rdf(self, value):
        return URIRef("ip:%s" % value)
class IpType(PropertyType):
    """IP addresses (v4 and v6), validated via the standard library."""

    name = "ip"
    group = "ips"
    label = _("IP-Address")
    plural = _("IP-Addresses")
    matchable = True
    pivot = True

    def validate(self, ip, **kwargs):
        """Check to see if this is a valid ip address."""
        try:
            ip_address(ip)
            return True
        except ValueError:
            return False

    def clean_text(self, text, **kwargs):
        """Create a more clean, but still user-facing version of an
        instance of the type."""
        try:
            parsed = ip_address(text)
        except ValueError:
            return None
        return str(parsed)

    def rdf(self, value):
        return URIRef("ip:%s" % value)
class IdentifierType(PropertyType):
    """Used for registration numbers, codes etc."""

    COMPARE_CLEAN = re.compile(r'[\W_]+')

    name = 'identifier'
    group = 'identifiers'
    label = _('Identifier')
    plural = _('Identifiers')
    matchable = True

    def normalize(self, text, **kwargs):
        """Normalize for comparison."""
        ids = super(IdentifierType, self).normalize(text, **kwargs)
        # `normalize` here is the module-level helper, not this method.
        return [normalize(value) for value in ids]

    def clean_compare(self, value):
        # TODO: should this be used for normalization?
        return self.COMPARE_CLEAN.sub('', value).lower()

    def compare(self, left, right):
        """Score similarity: exact match scores full specificity,
        containment scores a fraction of it."""
        left = self.clean_compare(left)
        right = self.clean_compare(right)
        score = self.specificity(shortest(left, right))
        if left == right:
            return score
        if left in right or right in left:
            return 0.8 * score
        return 0

    def _specificity(self, value):
        return dampen(4, 10, value)

    def node_id(self, value):
        return 'id:%s' % value
class IdentifierType(PropertyType):
    """Used for registration numbers, codes etc."""

    COMPARE_CLEAN = re.compile(r"[\W_]+")
    name = "identifier"
    group = "identifiers"
    label = _("Identifier")
    plural = _("Identifiers")
    matchable = True
    pivot = True

    def clean_compare(self, value):
        # TODO: should this be used for normalization?
        stripped = self.COMPARE_CLEAN.sub("", value)
        return stripped.lower()

    def compare(self, left, right):
        """Score the similarity of two identifiers."""
        left = self.clean_compare(left)
        right = self.clean_compare(right)
        score = self.specificity(shortest(left, right))
        if left == right:
            return score
        # Partial credit when one identifier contains the other.
        if left in right or right in left:
            return 0.8 * score
        return 0

    def _specificity(self, value):
        return dampen(6, 10, value)

    def node_id(self, value):
        return "id:%s" % value
class EntityType(PropertyType):
    """A reference to another entity via its checked identifier."""

    ID_RE = re.compile(r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$")

    name = "entity"
    group = "entities"
    label = _("Entity")
    plural = _("Entities")
    matchable = True
    pivot = True

    def validate(self, text, **kwargs):
        """Check that the value matches the entity ID alphabet."""
        text = sanitize_text(text)
        return text is not None and self.ID_RE.match(text) is not None

    def clean(self, text, **kwargs):
        """Pull the ID from a proxy/dict/string and return it if valid."""
        entity_id = get_entity_id(text)
        if entity_id is not None:
            candidate = str(entity_id)
            if self.ID_RE.match(candidate) is not None:
                return candidate

    def rdf(self, value):
        return URIRef("entity:%s" % value)

    def caption(self, value):
        # Entity references carry no caption of their own.
        return None
class AddressType(PropertyType):
    """A geographic address used to describe a location of a residence or
    post box. The sub-parts of an address (e.g. street, city, postal code)
    have no specified order; an Address schema type may eventually be
    introduced to retain fidelity when the parts are given separately."""

    LINE_BREAKS = re.compile(r"(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)")
    COMMATA = re.compile(r"(,\s?[,\.])")

    name = "address"
    group = "addresses"
    label = _("Address")
    plural = _("Addresses")
    matchable = True
    pivot = True

    def clean_text(self, address, **kwargs):
        """Basic clean-up."""
        # Turn line breaks and stray punctuation runs into ", ",
        # then squash repeated whitespace.
        address = self.COMMATA.sub(", ", self.LINE_BREAKS.sub(", ", address))
        address = collapse_spaces(address)
        if len(address):
            return address

    def _specificity(self, value: str) -> float:
        return dampen(10, 60, value)

    def node_id(self, value: str) -> str:
        return "addr:%s" % slugify(value)
class MimeType(PropertyType):
    """A MIME media type are a specification of a content type on a network.
    Each MIME type is assinged by IANA and consists of two parts: the type
    and sub-type. Common examples are: ``text/plain``, ``application/json``
    and ``application/pdf``.

    MIME type properties do not contain parameters as used in HTTP headers,
    like ``charset=UTF-8``."""

    name = "mimetype"
    group = "mimetypes"
    label = _("MIME-Type")
    plural = _("MIME-Types")
    matchable = False

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        # Discard the catch-all default type produced by normalisation.
        cleaned = normalize_mimetype(text)
        return cleaned if cleaned != DEFAULT else None

    def rdf(self, value: str) -> Identifier:
        return URIRef(f"urn:mimetype:{value}")

    def caption(self, value: str) -> str:
        # Fall back to the raw value when no label is registered.
        return parse_mimetype(value).label or value
class NameType(PropertyType):
    """A name used for a person or company."""

    name = 'name'
    group = 'names'
    label = _('Name')
    plural = _('Names')
    matchable = True

    def clean_text(self, name, **kwargs):
        """Basic clean-up."""
        name = strip_quotes(name)
        name = collapse_spaces(name)
        return name

    def pick(self, values):
        """Pick the most representative name out of a set of names.

        Sanitises each candidate, drops nulls, and returns the set
        median of the survivors.
        """
        values = [sanitize_text(v) for v in ensure_list(values)]
        values = [v for v in values if v is not None]
        if not len(values):
            return None
        if len(values) == 1:
            return values[0]
        # Sort before taking the set median so the result does not
        # depend on the (arbitrary) ordering of the input values.
        return setmedian(sorted(values))

    def _specificity(self, value):
        # TODO: insert artificial intelligence here.
        return dampen(3, 50, value)

    def compare(self, left, right):
        return jaro_winkler(left, right)
class UrlType(PropertyType):
    """A uniform resource locator (URL). Values are normalised so they use
    valid encoding/quoting and carry a schema (e.g. 'http', 'https', ...)."""

    name = "url"
    group = "urls"
    label = _("URL")
    plural = _("URLs")
    matchable = True
    pivot = True

    def validate(self, url, **kwargs):
        """Check if `url` is a valid URL."""
        return is_valid_url(url)

    def clean_text(self, url, **kwargs):
        """Perform intensive care on URLs, see `urlnormalizer`."""
        # Fragments are retained deliberately.
        return normalize_url(url, drop_fragments=False)

    def _specificity(self, value):
        return dampen(10, 120, value)

    def rdf(self, value):
        return URIRef(value)

    def node_id(self, value):
        return "url:%s" % value
class AddressType(PropertyType):
    """A postal or geographic address."""

    LINE_BREAKS = re.compile(r'(\r\n|\n|<BR/>|<BR>|\t|ESQ\.,|ESQ,|;)')
    COMMATA = re.compile(r'(,\s?[,\.])')

    name = 'address'
    group = 'addresses'
    label = _('Address')
    plural = _('Addresses')
    matchable = True

    def clean_text(self, address, **kwargs):
        """Basic clean-up."""
        # Replace line breaks and punctuation runs with ", ", then
        # collapse repeated whitespace.
        address = self.LINE_BREAKS.sub(', ', address)
        address = self.COMMATA.sub(', ', address)
        address = collapse_spaces(address)
        if len(address):
            return address

    # TODO: normalize well-known parts like "Street", "Road", etc.
    # TODO: consider using https://github.com/openvenues/pypostal for
    # a future `normalize` override to make addresses more comparable.

    def _specificity(self, value):
        return dampen(10, 60, value)
class MimeType(PropertyType):
    """A MIME media type are a specification of a content type on a network.
    Each MIME type is assinged by IANA and consists of two parts: the type
    and sub-type. Common examples are: ``text/plain``, ``application/json``
    and ``application/pdf``.

    MIME type properties do not contain parameters as used in HTTP headers,
    like ``charset=UTF-8``."""

    name = "mimetype"
    group = "mimetypes"
    label = _("MIME-Type")
    plural = _("MIME-Types")
    matchable = False

    def clean_text(self, text, **kwargs):
        """Normalise the MIME type, discarding the unknown/default type."""
        text = normalize_mimetype(text)
        if text != DEFAULT:
            return text

    def rdf(self, value):
        return URIRef("urn:mimetype:%s" % value)

    def caption(self, value):
        # Fall back to the raw value when no human-readable label is
        # registered, so a caption is always available to the UI.
        return parse_mimetype(value).label or value
class NameType(PropertyType):
    """A name used for a person or company."""

    name = "name"
    group = "names"
    label = _("Name")
    plural = _("Names")
    matchable = True
    pivot = True

    def clean_text(self, name, **kwargs):
        """Basic clean-up."""
        return collapse_spaces(strip_quotes(name))

    def pick(self, values):
        """Pick the most representative name from a set of candidates."""
        cleaned = [sanitize_text(v) for v in ensure_list(values)]
        cleaned = [v for v in cleaned if v is not None]
        if len(cleaned) > 1:
            # Sorting keeps the set-median choice deterministic.
            return setmedian(sorted(cleaned))
        return first(cleaned)

    def _specificity(self, value):
        # TODO: insert artificial intelligence here.
        return dampen(3, 50, value)

    def compare(self, left, right):
        return jaro_winkler(left, right)

    def node_id(self, value):
        return "name:%s" % slugify(value)
class StringType(PropertyType):
    """A simple string value with no added semantics."""

    name = "string"
    label = _("Label")
    plural = _("Labels")
    matchable = False

    def node_id(self, value):
        # Plain strings do not become graph nodes.
        return None
class EntityType(PropertyType):
    """A reference to another entity via its ID. This is how entities in FtM
    become a graph: by pointing at each other using :ref:`references`.

    Entity IDs can either be `namespaced` or `plain`, depending on the
    context. When setting properties of this type, you can pass in an
    entity proxy or dict of the entity, the ID will then be extracted
    and stored.
    """

    # Permitted ID alphabet: alphanumerics, dots and dashes, and the ID
    # may not begin or end with a dot/dash.
    REGEX_RAW = r"^[0-9a-zA-Z]([0-9a-zA-Z\.\-]*[0-9a-zA-Z])?$"
    REGEX = re.compile(REGEX_RAW)
    name = "entity"
    group = "entities"
    label = _("Entity")
    plural = _("Entities")
    matchable = True
    pivot = True

    def validate(self, value: str) -> bool:
        """Return True when the value sanitises to a well-formed entity ID."""
        text = sanitize_text(value)
        if text is None:
            return False
        return self.REGEX.match(text) is not None

    def clean(
        self,
        raw: Any,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        # Accepts an entity proxy, dict or plain string; the ID is
        # extracted before the usual text cleaning is applied.
        entity_id = get_entity_id(raw)
        if entity_id is None:
            return None
        return self.clean_text(entity_id, fuzzy=fuzzy, format=format, proxy=proxy)

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Specific types can apply their own cleaning routines here (this is
        called by ``clean`` after the value has been converted to a string and
        null values have been filtered)."""
        # An entity may not reference itself.
        if proxy is not None and text == proxy.id:
            msg = gettext("Self-relationship (%s): %s")
            raise InvalidData(msg % (proxy.schema, text))
        if self.REGEX.match(text) is not None:
            return text
        return None

    def rdf(self, value: str) -> Identifier:
        return URIRef(f"entity:{value}")

    def caption(self, value: str) -> None:
        # Entity references carry no human-readable caption.
        return None
class CountryType(EnumType):
    """Properties to define countries and territories. This is completely
    descriptive and needs to deal with data from many origins, so we support
    a number of unusual and controversial designations (e.g. the Soviet
    Union, Transnistria, Somaliland, Kosovo)."""

    name = "country"
    group = "countries"
    label = _("Country")
    plural = _("Countries")
    matchable = True

    def _locale_names(self, locale):
        """Build the code -> display-name mapping for a locale."""
        # extra territories that OCCRP is interested in.
        names = {
            "zz": gettext("Global"),
            "eu": gettext("European Union"),
            # Overwrite "Czechia" label:
            "cz": gettext("Czech Republic"),
            "xk": gettext("Kosovo"),
            "yucs": gettext("Yugoslavia"),
            "csxx": gettext("Serbia and Montenegro"),
            "suhh": gettext("Soviet Union"),
            "ge-ab": gettext("Abkhazia"),
            "x-so": gettext("South Ossetia"),
            "so-som": gettext("Somaliland"),
            "gb-wls": gettext("Wales"),
            "gb-sct": gettext("Scotland"),
            "gb-nir": gettext("Northern Ireland"),
            "md-pmr": gettext("Transnistria"),
        }
        for code, label in locale.territories.items():
            code = code.lower()
            # Custom names above take precedence over locale data.
            if code in names:
                continue
            try:
                # Purely numeric region codes (presumably continents and
                # aggregates — TODO confirm) are skipped.
                int(code)
            except ValueError:
                names[code] = label
        return names

    def clean_text(self, country, fuzzy=False, **kwargs):
        """Determine a two-letter country code based on an input.

        The input may be a country code, a country name, etc.
        """
        code = country.lower().strip()
        if code in self.codes:
            return code
        # Fall back to name lookup; `fuzzy` permits approximate matches.
        country = countrynames.to_code(country, fuzzy=fuzzy)
        if country is not None:
            return country.lower()

    def country_hint(self, value: str) -> str:
        # The value itself is already a country code.
        return value

    def rdf(self, value: str) -> Identifier:
        return URIRef("iso-3166-1:%s" % value)
class TextType(StringType):
    """Longer text fragments, such as descriptions or document text. Unlike
    string properties, it might make sense to treat properties of this type
    as full-text search material."""

    name = "text"
    label = _("Text")
    plural = _("Texts")
    # Cap the stored size of a single text value.
    max_size = 30 * MEGABYTE
class PhoneType(PropertyType):
    """A phone number, normalised to E.164 international format on clean."""

    name = "phone"
    group = "phones"
    label = _("Phone number")
    plural = _("Phone numbers")
    matchable = True
    pivot = True

    def _clean_countries(self, proxy):
        # Yield None first (parse with no region hint), then each of the
        # proxy's countries as an upper-cased region hint.
        yield None
        if proxy is not None:
            for country in proxy.countries:
                yield country.upper()

    def _parse_number(self, number, proxy=None):
        """Parse a phone number and return in international format.

        If no valid phone number can be detected, None is returned. If
        a country code is supplied, this will be used to infer the
        prefix.

        https://github.com/daviddrysdale/python-phonenumbers
        """
        for code in self._clean_countries(proxy):
            try:
                yield parse_number(number, code)
            except NumberParseException:
                # Move on to the next country hint.
                pass

    def clean_text(self, number, proxy=None, **kwargs):
        # First valid parse wins; formatted as E.164 (e.g. +14155552671).
        for num in self._parse_number(number, proxy=proxy):
            if is_valid_number(num):
                return format_number(num, PhoneNumberFormat.E164)

    def validate(self, number, proxy=None, **kwargs):
        """True if any parse attempt yields a valid number."""
        for num in self._parse_number(number, proxy=proxy):
            if is_valid_number(num):
                return True
        return False

    def country_hint(self, value):
        # Derive the region from the number's prefix; None on failure.
        try:
            number = parse_number(value)
            return geocoder.region_code_for_number(number).lower()
        except NumberParseException:
            pass

    def _specificity(self, value):
        # TODO: insert artificial intelligence here.
        return dampen(6, 11, value)

    def rdf(self, value):
        return URIRef("tel:%s" % value)

    def caption(self, value):
        # Assumes the stored value parses without a region hint
        # (i.e. is already in international format) — TODO confirm.
        number = parse_number(value)
        return format_number(number, PhoneNumberFormat.INTERNATIONAL)
class HTMLType(PropertyType):
    """Raw hypertext markup (HTML) content."""

    name = "html"
    label = _("HTML")
    plural = _("HTMLs")
    matchable = False
    # Cap the stored size of a single HTML value.
    max_size = 30 * MEGABYTE

    def node_id(self, value):
        # HTML blobs are not graph nodes.
        return None
class TextType(PropertyType):
    """Longer text fragments such as descriptions or document text."""

    name = "text"
    label = _("Text")
    plural = _("Texts")
    matchable = False
    # Cap the stored size of a single text value.
    max_size = 30 * MEGABYTE

    def node_id(self, value):
        # Free text does not become a graph node.
        return None
class DateType(PropertyType):
    """A date or time stamp. This is based on ISO 8601, but meant to allow
    for different degrees of precision by specifying a prefix. This means
    that ``2021``, ``2021-02``, ``2021-02-16``, ``2021-02-16T21``,
    ``2021-02-16T21:48`` and ``2021-02-16T21:48:52`` are all valid values,
    with an implied precision.

    The timezone is always expected to be UTC and cannot be specified
    otherwise. There is no support for calendar weeks (``2021-W7``) and
    date ranges (``2021-2024``)."""

    name = "date"
    group = "dates"
    label = _("Date")
    plural = _("Dates")
    matchable = True

    def validate(self, value: str) -> bool:
        """Check if a thing is a valid date."""
        prefix = parse(value)
        return prefix.precision != Precision.EMPTY

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """The classic: date parsing, every which way."""
        # An explicit format string takes precedence over prefix parsing.
        if format is not None:
            return parse_format(text, format).text
        return parse(text).text

    def _specificity(self, value: str) -> float:
        return dampen(5, 13, value)

    def compare(self, left: str, right: str) -> float:
        # Score by length of the shared ISO prefix: the more of the two
        # date strings agree from the start, the closer the dates.
        prefix = os.path.commonprefix([left, right])
        return dampen(4, 10, prefix)

    def rdf(self, value: str) -> Identifier:
        return Literal(value, datatype=XSD.dateTime)

    def node_id(self, value: str) -> str:
        return f"date:{value}"

    def to_datetime(self, value: str) -> Optional[datetime]:
        return parse(value).dt

    def to_number(self, value: str) -> Optional[float]:
        """Convert to a UNIX timestamp, or None when unparseable."""
        date = self.to_datetime(value)
        if date is None:
            return None
        # We make a best effort all over the app to ensure all times are in
        # UTC.
        if date.tzinfo is None:
            date = date.replace(tzinfo=timezone.utc)
        return date.timestamp()
class LanguageType(PropertyType):
    """A natural language, stored as a three-letter code (via
    ``iso_639_alpha3``) drawn from a configurable whitelist."""

    name = 'language'
    group = 'languages'
    label = _('Language')
    plural = _('Languages')
    matchable = False

    # Language whitelist
    LANGUAGES = [
        'eng', 'fra', 'deu', 'rus', 'spa', 'nld', 'ron', 'kat', 'ara', 'tur',
        'ltz', 'ell', 'lit', 'ukr', 'zho', 'bel', 'bul', 'bos', 'jpn', 'ces',
        'lav', 'por', 'pol', 'hye', 'hrv', 'hin', 'heb', 'uzb', 'mon', 'urd',
        'sqi', 'kor', 'isl', 'ita', 'est', 'nor', 'fas', 'swa', 'slv', 'slk',
        'aze', 'tgk', 'kaz', 'tuk', 'kir', 'hun', 'dan', 'afr', 'swe', 'srp',
        'ind', 'kan', 'mkd', 'mlt', 'msa', 'fin', 'cat'
    ]
    # The whitelist can be overridden through the environment.
    LANGUAGES = get_env_list('FTM_LANGUAGES', LANGUAGES)
    LANGUAGES = [l.lower().strip() for l in LANGUAGES]

    def __init__(self, *args):
        # Per-locale cache for the `names` property.
        # NOTE(review): super().__init__ is not called here — confirm the
        # base class requires no initialisation of its own.
        self._names = {}

    @property
    def names(self):
        """Code -> display-name mapping for the current locale (cached)."""
        locale = get_locale()
        if locale not in self._names:
            self._names[locale] = {}
            for lang in self.LANGUAGES:
                # Default to the code itself when no label is known.
                self._names[locale][lang] = lang
            for code, label in locale.languages.items():
                code = iso_639_alpha3(code)
                # Only label languages that are in the whitelist.
                if code in self.LANGUAGES:
                    self._names[locale][code] = label
        return self._names[locale]

    def validate(self, text, **kwargs):
        """A value is valid when it is a whitelisted alpha-3 code."""
        text = sanitize_text(text)
        if text is None:
            return False
        return text in self.LANGUAGES

    def clean_text(self, text, **kwargs):
        # Map any input (alpha-2, alpha-3, ...) to alpha-3; drop values
        # outside the whitelist.
        code = iso_639_alpha3(text)
        if code in self.LANGUAGES:
            return code

    def rdf(self, value):
        return URIRef('iso-639:%s' % value)

    def caption(self, value):
        return self.names.get(value, value)

    def to_dict(self):
        # Expose the display names so API clients can render languages.
        data = super(LanguageType, self).to_dict()
        data['values'] = self.names
        return data
class ChecksumType(PropertyType):
    """Used for content hashes, usually SHA1 (I know, I know)."""

    name = 'checksum'
    group = 'checksums'
    label = _('Checksum')
    plural = _('Checksums')
    matchable = True

    def rdf(self, value):
        # Checksums are addressed in RDF via a hash: URI.
        return URIRef('hash:%s' % value)
class StringType(PropertyType):
    """A simple string property with no additional semantics."""

    name = "string"
    label = _("Label")
    plural = _("Labels")
    matchable = False

    def node_id(self, value):
        # Plain labels never become graph nodes.
        return None
class GenderType(EnumType):
    """A human gender. This is not meant to be a comprehensive model of the
    social realities of gender but a way to capture data from (mostly)
    government databases and represent it in a way that can be used by
    structured tools. I'm not sure this justifies the simplification."""

    MALE = "male"
    FEMALE = "female"
    OTHER = "other"

    # Common source-data spellings mapped onto the canonical codes.
    LOOKUP = {
        "m": MALE,
        "man": MALE,
        "masculin": MALE,
        "männlich": MALE,
        "мужской": MALE,
        "f": FEMALE,
        "woman": FEMALE,
        "féminin": FEMALE,
        "weiblich": FEMALE,
        "женский": FEMALE,
        "o": OTHER,
        "d": OTHER,
        "divers": OTHER,
    }

    name = "gender"
    group = "genders"
    label = _("Gender")
    plural = _("Genders")
    matchable = False

    def _locale_names(self, locale: Locale) -> EnumValues:
        return {
            self.MALE: gettext("male"),
            self.FEMALE: gettext("female"),
            self.OTHER: gettext("other"),
        }

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Map the raw value onto a canonical gender code, or None."""
        token = text.lower().strip()
        token = self.LOOKUP.get(token, token)
        if token in self.codes:
            return token
        return None

    def rdf(self, value: str) -> Identifier:
        return URIRef(f"gender:{value}")
class ChecksumType(PropertyType):
    """Used for content hashes, usually SHA1 (I know, I know)."""

    name = "checksum"
    group = "checksums"
    label = _("Checksum")
    plural = _("Checksums")
    matchable = True

    def rdf(self, value):
        # Content hashes are addressed with a hash: URI scheme.
        return URIRef("hash:%s" % value)
class UrlType(PropertyType):
    """A uniform resource locator (URL). This will perform some
    normalisation on the URL so that it's sure to be using valid
    encoding/quoting, and to make sure the URL has a schema
    (e.g. 'http', 'https', ...)."""

    # Only these schemes are accepted; anything else is dropped.
    SCHEMES = ("http", "https", "ftp", "mailto")
    # Assumed when the input carries no scheme at all.
    DEFAULT_SCHEME = "http"

    name = "url"
    group = "urls"
    label = _("URL")
    plural = _("URLs")
    matchable = True
    pivot = True

    def clean_text(
        self,
        text: str,
        fuzzy: bool = False,
        format: Optional[str] = None,
        proxy: Optional["EntityProxy"] = None,
    ) -> Optional[str]:
        """Perform intensive care on URLs to make sure they have a scheme
        and a host name. If no scheme is given HTTP is assumed."""
        try:
            parsed = urlparse(text)
        except (TypeError, ValueError):
            return None
        if not len(parsed.netloc):
            if "." in parsed.path and not text.startswith("//"):
                # This is a pretty weird rule meant to catch things like
                # 'www.google.com', but it'll likely backfire in some
                # really creative ways.
                return self.clean_text(f"//{text}")
            return None
        if not len(parsed.scheme):
            parsed = parsed._replace(scheme=self.DEFAULT_SCHEME)
        else:
            # Schemes are case-insensitive; lower-case for comparison.
            parsed = parsed._replace(scheme=parsed.scheme.lower())
        if parsed.scheme not in self.SCHEMES:
            return None
        if not len(parsed.path):
            # Normalise the empty path to "/".
            parsed = parsed._replace(path="/")
        return parsed.geturl()

    def _specificity(self, value: str) -> float:
        return dampen(10, 120, value)

    def rdf(self, value: str) -> Identifier:
        return URIRef(value)

    def node_id(self, value: str) -> Optional[str]:
        return f"url:{value}"
class EntityType(PropertyType):
    """A reference to another entity, by ID."""

    name = 'entity'
    group = 'entities'
    label = _('Entity')
    plural = _('Entities')
    matchable = True

    def clean(self, text, **kwargs):
        """Extract the entity ID from a proxy, dict or plain value."""
        return get_entity_id(text)

    def rdf(self, value):
        return URIRef('urn:entity:%s' % value)
class HTMLType(StringType):
    """Properties that contain raw hypertext markup (HTML).

    User interfaces rendering properties of this type need to take extreme
    care not to allow attacks such as cross-site scripting. It is
    recommended to perform server-side sanitisation, or to not render this
    property at all.
    """

    name = "html"
    label = _("HTML")
    plural = _("HTMLs")
    # Cap the stored size of a single HTML value.
    max_size = 30 * MEGABYTE