def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts (standard) padded Base32 strings.

    See https://tools.ietf.org/html/rfc4648#section-6

    A candidate is dropped when it is shorter than ``min_len`` or when a
    single non-padding character accounts for more than 60 % of it; the
    remaining candidates are checked by ``Base32Validator.run``.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments
    :keyword min_len: Minimum length of base32 found strings, defaults to 25
    :return: Generator of Base32 strings
    """
    min_len = kwargs.get("min_len", 25)

    def strip_escaped_newlines(text):
        # Raw-string tokens: these are the literal two-character escape
        # sequences (backslash + letter), not real line-break characters.
        for token in (r"\r\n", r"\n", r"\r"):
            text = text.replace(token, "")
        return text

    def validate(candidate):
        if len(candidate) < min_len:
            return False
        limit = len(candidate) * 0.6
        for symbol, occurrences in Counter(candidate).items():
            # Padding ("=") is allowed to dominate; any other symbol is not.
            if symbol != "=" and occurrences > limit:
                return False
        return Base32Validator.run(candidate)

    yield from _extract_with_regex(
        _input,
        RE_BASE32,
        validator=validate,
        per_line=True,
        preprocess=strip_escaped_newlines,
        data_kind=Base32Extractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts JSON.

    A candidate is accepted only when it parses with ``json.loads``, the
    parsed value is a non-empty object, array or string, and -- if it is an
    array -- it holds at least two items.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments
    :return: Generator of JSON strings
    """

    def validate(data):
        try:
            parsed = json.loads(data)
        except (ValueError, TypeError, RecursionError):
            # ValueError covers json.JSONDecodeError; TypeError a non-text
            # input; RecursionError pathologically nested documents.
            return False
        # Scalars (numbers, booleans, null) have no length; reject them
        # instead of letting len() raise TypeError out of the validator.
        if not isinstance(parsed, (dict, list, str)):
            return False
        if len(parsed) == 0:
            return False
        if isinstance(parsed, list) and len(parsed) < 2:
            return False
        return True

    yield from _extract_with_regex(
        _input,
        RE_JSON,
        validator=validate,
        per_line=False,
        data_kind=JsonExtractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts matches of ``RE_BTC_PRIVKEY`` accepted by
    ``BitcoinPrivKeyValidator.run``.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of the items produced by ``_extract_with_regex``
    """
    options = {
        "validator": BitcoinPrivKeyValidator.run,
        "data_kind": BitcoinPrivateKey.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_BTC_PRIVKEY, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts matches of ``RE_BTC_WIF`` accepted by
    ``BitcoinWifValidator.run``.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of the items produced by ``_extract_with_regex``
    """
    found = _extract_with_regex(
        _input,
        RE_BTC_WIF,
        validator=BitcoinWifValidator.run,
        data_kind=BitcoinWif.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts matches of ``RE_BIP32_XKEY``.

    When ``has_btclib`` is truthy the matches are additionally passed to
    ``slip132.address_from_xkey`` as the validator; otherwise no validator
    is used.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of the items produced by ``_extract_with_regex``
    """
    if has_btclib:
        checker = slip132.address_from_xkey
    else:
        checker = None
    yield from _extract_with_regex(
        _input,
        RE_BIP32_XKEY,
        validator=checker,
        data_kind=Bip32XKey.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts (standard) padded Base64 strings.

    See https://tools.ietf.org/html/rfc4648#section-4

    A candidate is dropped when it is shorter than ``min_len`` or when any
    of its "/"-separated parts is longer than four characters, contains no
    "=" and is written entirely in one letter case; the remaining
    candidates are checked by ``Base64Validator.run`` in strict mode.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments
    :keyword min_len: Minimum length of base64 found string, defaults to 25
    :return: Generator of Base64 strings
    """
    min_len = kwargs.get("min_len", 25)

    def is_single_case_part(part):
        return (
            len(part) > 4
            and "=" not in part
            and part in (part.upper(), part.lower())
        )

    def validate(candidate):
        if len(candidate) < min_len:
            return False
        for part in candidate.split("/"):
            if is_single_case_part(part):
                return False
        return Base64Validator.run(candidate, strict=True)

    def drop_newlines(text):
        # First remove real line breaks ...
        text = re.sub("\r\n|\n|\r", "", text)
        # ... then the literal escape sequences (backslash + letter).
        for token in (r"\r\n", r"\n", r"\r"):
            text = text.replace(token, "")
        return text

    yield from _extract_with_regex(
        _input,
        RE_BASE64,
        validator=validate,
        per_line=False,
        postprocess=drop_newlines,
        data_kind=Base64Extractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts strings that conform to the MD5 hash string format.

    :param _input: String or a list of strings to extract MD5 hash strings from
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of MD5 hash strings
    """
    options = {"per_line": True, "data_kind": MD5Extractor.PLUGIN_NAME}
    yield from _extract_with_regex(_input, RE_MD5, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts strings that conform to the SHA-512 hash string format.

    :param _input: String or a list of strings to extract SHA-512 hash strings from
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of SHA-512 hash strings
    """
    found = _extract_with_regex(
        _input,
        RE_SHA512,
        per_line=True,
        data_kind=SHA512Extractor.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts PEM objects.

    A PEM object is delimited by the header `-----BEGIN <label>-----` and
    the trailer `-----END <label>-----`.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of PEM object strings
    """
    options = {"per_line": False, "data_kind": PemExtractor.PLUGIN_NAME}
    yield from _extract_with_regex(_input, RE_PEM, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts URNs from a string or a list of strings.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator with URNs
    """
    found = _extract_with_regex(
        _input, RE_URN, data_kind=URNExtractor.PLUGIN_NAME
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Polkadot (DOT) addresses from a string or a list of
    strings.

    Candidates matching ``RE_DOT`` are kept when ``PolkadotValidator.run``
    accepts them.

    :param _input: String or a list of strings
    :return: Generator of formally valid Polkadot addresses
    """
    options = {
        "validator": PolkadotValidator.run,
        "data_kind": PolkadotAddress.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_DOT, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Chainlink (LINK) addresses from a string or a list of
    strings.

    :param _input: String or a list of strings
    :return: Generator of formally valid Chainlink addresses
    """
    found = _extract_with_regex(
        _input,
        RE_LINK,
        validator=ChainlinkValidator.run,
        data_kind=ChainlinkAddress.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Tether (USDT) addresses from a string or a list of
    strings.

    :param _input: String or a list of strings
    :return: Generator of formally valid Tether addresses
    """
    options = {
        "validator": TetherValidator.run,
        "data_kind": TetherAddress.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_USDT, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid data URIs from a string or a list of strings.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator with data URIs
    """
    found = _extract_with_regex(
        _input,
        RE_DATA_URI,
        validator=DataURIValidator.run,
        data_kind=DataURIExtractor.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Ethereum (ETH) addresses from a string or a list of
    strings.

    Looks for legacy addresses and EIP-55 addresses.

    :param _input: String or a list of strings
    :return: Generator of found valid Ethereum addresses
    """
    options = {
        "validator": EthereumValidator.run,
        "data_kind": EthereumAddressExtractor.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_ETH, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Ripple (XRP) addresses from a string or a list of
    strings.

    See: https://xrpl.org/accounts.html#addresses

    :param _input: String or a list of strings to extract Ripple addresses from
    :return: Generator of found valid Ripple addresses
    """
    found = _extract_with_regex(
        _input,
        RE_XRP,
        validator=RippleValidator.run,
        data_kind=RippleAddress.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts e-mail addresses from a string or a list of strings.

    :param _input: String or a list of strings to extract e-mail addresses from
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of e-mail addresses
    """
    options = {
        "validator": EmailValidator.run,
        "per_line": True,
        "data_kind": EmailExtractor.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_EMAIL, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid ISSN identifiers from a string or a list of strings.

    ``IssnExtractor.valid_issns`` is handed to ``_extract_with_regex`` as
    its ``cached_values`` argument.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator with ISSN identifiers
    """
    found = _extract_with_regex(
        _input,
        RE_ISSN,
        validator=IssnValidator.run,
        cached_values=IssnExtractor.valid_issns,
        data_kind=IssnExtractor.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts sequences of hex strings, where each two hex chars are
    separated by a selected delimiter.

    The delimiter is substituted into ``HEX_PATTERN_TEMPLATE`` as-is and
    the result is compiled case-insensitively.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments
    :keyword delim: Delimiter separating 2-digit hex representation of a
        byte, can be regex pattern string. Defaults to empty string ("")
    :return: Generator of hex-representation strings
    """
    separator = kwargs.get("delim", "")
    pattern_text = HEX_PATTERN_TEMPLATE.format(delim=separator)
    hex_regex = re.compile(pattern_text, re.IGNORECASE)
    yield from _extract_with_regex(
        _input, hex_regex, data_kind=HexExtractor.PLUGIN_NAME
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Bitcoin Cash (BCH) addresses from a string or a list
    of strings.

    :param _input: String or a list of strings
    :keyword include_legacy: Flag to include legacy addresses conforming to
        BTC address format. Defaults to True
    :return: Generator of formally valid Bitcoin Cash addresses
    """
    if kwargs.get("include_legacy", True):
        pattern = RE_BCH_WITH_LEGACY
    else:
        pattern = RE_BCH
    yield from _extract_with_regex(
        _input,
        pattern,
        validator=BitcoinCashValidator.run,
        data_kind=BitcoinCashAddress.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts URIs from a string or a list of strings.

    See https://tools.ietf.org/html/rfc3986

    .. warning::
        This method does no filtering on specific schemes. Therefore, it
        may return lots of noise patterns.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments
    :keyword strict: Flag to reduce the number of results, if True then only
        path-like results with "/" path parts delimiter are returned.
        Defaults to False
    :keyword relative: Flag to allow URI relative references, otherwise some
        scheme must be present, default to False. NOTE: this keyword is not
        read by this method at present.
    :keyword schemes: Iterable of lower-cased schemes (e.g. http, data) URI
        must contain. If empty, the URI is not restricted by a scheme.
        Defaults to registered schemes
    :return: Generator of URIs
    """
    strict = kwargs.get("strict", False)
    # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
    schemes = kwargs.get("schemes", URI_SCHEMES)

    # Accept each scheme as given, upper-cased and lower-cased.  Sorting
    # keeps the generated pattern deterministic across runs.
    variants = sorted(
        {form for scheme in schemes
         for form in (scheme, scheme.upper(), scheme.lower())}
    )
    # Scheme names may contain regex metacharacters ("+", ".", "-"), so they
    # must be escaped before being placed into the alternation.
    alternatives = "|".join(re.escape(variant) for variant in variants)
    # An empty alternation would turn the lookahead into "a bare ':' must
    # follow", which matches nothing useful; omit the restriction instead.
    scheme_guard = r"(?=(?:{}):)".format(alternatives) if alternatives else ""
    re_uri = re.compile(r"\b{}{}\b".format(scheme_guard, URI), re.VERBOSE)

    yield from _extract_with_regex(
        _input,
        re_uri,
        # Cheap pre-filter: blank out input lacking two same-case letters
        # directly followed by ":" before the full pattern is applied.
        preprocess=lambda x: ""
        if re.search(r"(?:[a-z]{2}|[A-Z]{2}):", x) is None
        else x,
        validator=lambda val: URIValidator.run(val, strict=strict, schemes=schemes),
        data_kind=URIExtractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid DOI identifiers from a string or a list of strings.

    A match containing "(" but no ")" is cut at the first "(".

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator with DOI identifiers
    """

    def cut_unclosed_parenthesis(doi):
        if "(" in doi and ")" not in doi:
            return doi.split("(")[0]
        return doi

    yield from _extract_with_regex(
        _input,
        RE_DOI,
        validator=DoiValidator.run,
        per_line=True,
        data_kind=DoiExtractor.PLUGIN_NAME,
        postprocess=cut_unclosed_parenthesis,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Litecoin addresses from a string or a list of strings.

    Looks for addresses that start with 'M', 'L', or '3' char.

    .. warning::
        An address starting with '3' may represent a Bitcoin address.

    :param _input: String or a list of strings
    :return: Generator of found valid Litecoin addresses
    """
    options = {
        "validator": LitecoinValidator.run,
        "data_kind": LitecoinAddress.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_LTC, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts MAC addresses.

    Each extracted item is additionally given an ``"info"`` entry when
    ``EUI(...).info`` yields data that can be parsed; items for which the
    lookup or parsing fails are still yielded, just without ``"info"``.

    :param _input: String or a list of strings to extract MAC address from
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of MAC addresses
    """
    for mac in _extract_with_regex(
        _input, RE_MAC, data_kind=MACAddressExtractor.PLUGIN_NAME
    ):
        try:
            info = EUI(mac["value"]).info
            if info:
                # The info object is rendered with single quotes; swap them
                # for double quotes so the text can be parsed as JSON.
                mac["info"] = json.loads(str(info).replace("'", '"'))
        except Exception:
            # Enrichment is best-effort, so lookup/parsing errors are
            # ignored.  Catching Exception (not a bare except) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass
        yield mac
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts IPv6 address strings from a string or a list of strings.

    See https://tools.ietf.org/html/rfc3986#section-3.2.2 for the form of
    IPv6 address. Candidates of six characters or fewer are rejected before
    ``IPv6AddressValidator.run`` is consulted.

    :param _input: String or a list of strings to extract IPv6 addresses from
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of IPv6 addresses
    """

    def validate(candidate):
        return len(candidate) > 6 and IPv6AddressValidator.run(candidate)

    yield from _extract_with_regex(
        _input,
        RE_IPV6,
        validator=validate,
        per_line=True,
        data_kind=IPv6AddressExtractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts URLs from a string or a list of strings.

    URL must contain one of the following schemes:
        - `http`, `https`, `ftp`

    See https://tools.ietf.org/html/rfc3986

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator with URLs
    """
    found = _extract_with_regex(
        _input,
        RE_URL,
        validator=URLValidator.run,
        data_kind=URLExtractor.PLUGIN_NAME,
    )
    yield from found
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts valid Bitcoin addresses from a string or a list of strings.

    Looks for addresses on mainnet:
        - base58-encoded : must conform to pattern
          /[13][a-km-zA-HJ-NP-Z1-9]{25,34}/
        - segwit (bech32-encoded) : must conform to pattern
          /(?:[bB][cC])1[a-zA-HJ-NP-Z0-9]{25,39}/

    See:
        - https://en.bitcoin.it/wiki/Address

    :param _input: String or a list of strings to extract Bitcoin addresses from
    :return: Generator of found valid Bitcoin addresses
    """
    options = {
        "validator": BitcoinValidator.run,
        "data_kind": BitcoinAddress.PLUGIN_NAME,
    }
    yield from _extract_with_regex(_input, RE_BTC, **options)
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts IPv4 address strings from a string or a list of strings.

    See https://tools.ietf.org/html/rfc3986#section-3.2.2 for the form of
    IPv4 address. A candidate is kept when ``IPv4AddressValidator.run``
    accepts it and it consists of more than one "."-separated part.

    :param _input: String or a list of strings to extract IPv4 addresses from
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of IPv4 addresses
    """

    def validate(value):
        # NOTE(review): the previous code also compared the *list* returned
        # by split(".") with the string "0", which can never be true, so the
        # only effective extra check was "more than one part".  That
        # effective behaviour is kept here; if a first-octet filter was
        # intended it has to be specified and added deliberately.
        parts = value.split(".")
        return IPv4AddressValidator.run(value) and len(parts) > 1

    yield from _extract_with_regex(
        _input,
        RE_IPV4,
        validator=validate,
        per_line=True,
        data_kind=IPv4AddressExtractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts form fields data in HTTP, URL.

    A match is kept when it is at least ``min_len`` characters long and is
    not a lone field with an empty value (no "&" and a trailing "=").

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments
    :keyword min_len: Minimum length of extracted pattern. Defaults to 20.
    :keyword decode: Flag to percent (URL) decode the found pattern.
        Defaults to True
    :return: Generator of form fields in decoded format
    """
    min_len = kwargs.get("min_len", 20)
    decode = kwargs.get("decode", True)

    def validate(fields):
        if len(fields) < min_len:
            return False
        return "&" in fields or not fields.endswith("=")

    def finalize(fields):
        if decode:
            return unquote_plus(fields)
        return fields

    yield from _extract_with_regex(
        _input,
        RE_URL_FORM_FIELDS,
        validator=validate,
        postprocess=finalize,
        data_kind=FormFieldsExtractor.PLUGIN_NAME,
    )
def run(cls, _input: str, **kwargs) -> Iterable[dict]:
    """Extracts GUIDs strings.

    Matches are normalised to the lower-case, dash-separated 8-4-4-4-12
    layout. Each yielded item carries an ``"info"`` dict with the
    ``variant`` and ``version`` reported by ``uuid.UUID``; items that cannot
    be parsed or whose version is ``None`` are skipped.

    :param _input: String or a list of strings
    :param kwargs: Arbitrary keyword arguments (not read by this method)
    :return: Generator of GUID strings
    """

    def validate(value):
        # A candidate with exactly four dashes is accepted as-is.
        if value.count("-") == 4:
            return True
        compact = value.replace("-", "")
        # Otherwise the hex digit at index 12 must be one of 0-5.  Slicing
        # (instead of indexing) makes a too-short candidate fail validation
        # rather than raise IndexError.
        return compact[12:13] in {"0", "1", "2", "3", "4", "5"}

    def get_info(value):
        try:
            parsed = uuid.UUID(value)
        except (ValueError, TypeError, AttributeError):
            # uuid.UUID rejects malformed or non-string input with these.
            return None
        return {"variant": parsed.variant, "version": parsed.version}

    def normalize(value):
        value = value.replace(r"-", "").lower()
        return "{}-{}-{}-{}-{}".format(
            value[:8], value[8:12], value[12:16], value[16:20], value[20:]
        )

    for obj in _extract_with_regex(
        _input,
        RE_GUID,
        per_line=True,
        data_kind=UuidExtractor.PLUGIN_NAME,
        validator=validate,
        postprocess=normalize,
    ):
        info = get_info(obj["value"])
        if not info or info["version"] is None:
            continue
        obj["info"] = info
        yield obj