Python _extract_with_regex示例，metext.plugins.extractors._extract_with_regex Python示例

示例#1

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract padded Base32 candidates from the input.

        See https://tools.ietf.org/html/rfc4648#section-6

        The search is delegated to ``_extract_with_regex`` using ``RE_BASE32``
        in per-line mode; every item it produces is yielded unchanged.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :keyword min_len: Minimum length a candidate must have to be accepted,
        defaults to 25
        :return: Generator of the items produced for accepted Base32 strings
        """
        shortest = kwargs.get("min_len", 25)

        def strip_escaped_newlines(text):
            # Raw-string patterns: these remove the literal two-character
            # sequences backslash-r / backslash-n, not real line breaks.
            for escaped in (r"\r\n", r"\n", r"\r"):
                text = text.replace(escaped, "")
            return text

        def is_plausible(candidate):
            size = len(candidate)
            if size < shortest:
                return False

            # Reject candidates dominated (more than 60 %) by one symbol;
            # the padding character "=" is exempt from this check.
            limit = size * 0.6
            for symbol, occurrences in Counter(candidate).items():
                if symbol != "=" and occurrences > limit:
                    return False

            return Base32Validator.run(candidate)

        yield from _extract_with_regex(
            _input,
            RE_BASE32,
            validator=is_plausible,
            per_line=True,
            preprocess=strip_escaped_newlines,
            data_kind=Base32Extractor.PLUGIN_NAME,
        )

示例#2

0

显示文件

文件： json.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extracts JSON.

        Candidates are located by ``_extract_with_regex`` using ``RE_JSON``
        (not per line) and kept only if they parse as non-empty JSON; a JSON
        array additionally needs at least two elements.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :return: Generator of the items produced for accepted JSON strings
        """
        def validate(data):
            try:
                parsed = json.loads(data)
            except (ValueError, TypeError, RecursionError):
                # ValueError covers json.JSONDecodeError; TypeError covers a
                # non-string input; RecursionError covers extreme nesting.
                return False

            try:
                size = len(parsed)
            except TypeError:
                # Scalars (numbers, booleans, null) have no length and are
                # not treated as extracted JSON documents.
                return False

            if size == 0:
                return False
            if isinstance(parsed, list) and size < 2:
                return False
            return True

        yield from _extract_with_regex(
            _input,
            RE_JSON,
            validator=validate,
            per_line=False,
            data_kind=JsonExtractor.PLUGIN_NAME,
        )

示例#3

0

显示文件

文件： crypto.py 项目： espoem/MetExt

 def run(cls, _input: str, **kwargs) -> Iterable[dict]:
     """Yield the items ``_extract_with_regex`` produces for ``RE_BTC_PRIVKEY``.

     Matches are checked with ``BitcoinPrivKeyValidator.run`` and labelled
     with ``BitcoinPrivateKey.PLUGIN_NAME``.

     :param _input: Input to search
     :param kwargs: Arbitrary keyword arguments (not read here)
     :return: Generator of the produced items
     """
     found = _extract_with_regex(
         _input,
         RE_BTC_PRIVKEY,
         data_kind=BitcoinPrivateKey.PLUGIN_NAME,
         validator=BitcoinPrivKeyValidator.run,
     )
     yield from found

示例#4

0

显示文件

文件： crypto.py 项目： espoem/MetExt

 def run(cls, _input: str, **kwargs) -> Iterable[dict]:
     """Yield the items ``_extract_with_regex`` produces for ``RE_BTC_WIF``.

     Matches are checked with ``BitcoinWifValidator.run`` and labelled with
     ``BitcoinWif.PLUGIN_NAME``.

     :param _input: Input to search
     :param kwargs: Arbitrary keyword arguments (not read here)
     :return: Generator of the produced items
     """
     found = _extract_with_regex(
         _input,
         RE_BTC_WIF,
         data_kind=BitcoinWif.PLUGIN_NAME,
         validator=BitcoinWifValidator.run,
     )
     yield from found

示例#5

0

显示文件

文件： crypto.py 项目： espoem/MetExt

 def run(cls, _input: str, **kwargs) -> Iterable[dict]:
     """Yield the items ``_extract_with_regex`` produces for ``RE_BIP32_XKEY``.

     When ``has_btclib`` is truthy, ``slip132.address_from_xkey`` is passed as
     the validator; otherwise no validator is supplied.

     :param _input: Input to search
     :param kwargs: Arbitrary keyword arguments (not read here)
     :return: Generator of the produced items
     """
     if has_btclib:
         checker = slip132.address_from_xkey
     else:
         checker = None
     yield from _extract_with_regex(
         _input,
         RE_BIP32_XKEY,
         validator=checker,
         data_kind=Bip32XKey.PLUGIN_NAME,
     )

示例#6

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract padded Base64 candidates from the input.

        See https://tools.ietf.org/html/rfc4648#section-4

        The search is delegated to ``_extract_with_regex`` using ``RE_BASE64``
        (not per line); every item it produces is yielded unchanged.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :keyword min_len: Minimum length a candidate must have to be accepted,
        defaults to 25
        :return: Generator of the items produced for accepted Base64 strings
        """
        shortest = kwargs.get("min_len", 25)

        def looks_single_case(chunk):
            # A "/"-separated piece longer than four characters, without
            # padding, that is entirely upper- or entirely lower-case.
            if len(chunk) <= 4 or "=" in chunk:
                return False
            return chunk == chunk.upper() or chunk == chunk.lower()

        def is_plausible(candidate):
            if len(candidate) < shortest:
                return False
            for chunk in candidate.split("/"):
                if looks_single_case(chunk):
                    return False
            return Base64Validator.run(candidate, strict=True)

        def drop_line_breaks(text):
            # First real line breaks, then the literal escaped forms
            # (backslash followed by "r"/"n") written out in the text.
            text = re.sub("\r\n|\n|\r", "", text)
            for escaped in (r"\r\n", r"\n", r"\r"):
                text = text.replace(escaped, "")
            return text

        yield from _extract_with_regex(
            _input,
            RE_BASE64,
            validator=is_plausible,
            per_line=False,
            postprocess=drop_line_breaks,
            data_kind=Base64Extractor.PLUGIN_NAME,
        )

示例#7

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract strings matching the MD5 hash pattern ``RE_MD5``.

        :param _input: String or a list of strings to extract MD5 hash string from
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "per_line": True,
            "data_kind": MD5Extractor.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_MD5, **options)

示例#8

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract strings matching the SHA-512 hash pattern ``RE_SHA512``.

        :param _input: String or a list of strings to extract SHA-512 hash string from
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "per_line": True,
            "data_kind": SHA512Extractor.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_SHA512, **options)

示例#9

0

显示文件

文件： pem.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract PEM objects, i.e. text enclosed between a
        `-----BEGIN <label>-----` header and an `-----END <label>-----` trailer.

        The search uses ``RE_PEM`` and is not done per line.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        pem_items = _extract_with_regex(
            _input,
            RE_PEM,
            data_kind=PemExtractor.PLUGIN_NAME,
            per_line=False,
        )
        yield from pem_items

示例#10

0

显示文件

文件： uri.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract URNs (matches of ``RE_URN``) from the input.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        urn_items = _extract_with_regex(
            _input, RE_URN, data_kind=URNExtractor.PLUGIN_NAME
        )
        yield from urn_items

示例#11

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract addresses matching ``RE_DOT`` that pass ``PolkadotValidator.run``.

        NOTE(review): judging by the names used here (``RE_DOT``,
        ``PolkadotValidator``, ``PolkadotAddress``) this targets Polkadot (DOT)
        addresses, not Cardano (ADA) -- confirm.

        :param _input: String or a list of strings
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        found = _extract_with_regex(
            _input,
            RE_DOT,
            data_kind=PolkadotAddress.PLUGIN_NAME,
            validator=PolkadotValidator.run,
        )
        yield from found

示例#12

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Chainlink (LINK) addresses from a string or a list of strings.

        Candidates match ``RE_LINK`` and are checked by ``ChainlinkValidator.run``.

        :param _input: String or a list of strings
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        found = _extract_with_regex(
            _input,
            RE_LINK,
            data_kind=ChainlinkAddress.PLUGIN_NAME,
            validator=ChainlinkValidator.run,
        )
        yield from found

示例#13

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Tether (USDT) addresses from a string or a list of strings.

        Candidates match ``RE_USDT`` and are checked by ``TetherValidator.run``.

        :param _input: String or a list of strings
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        found = _extract_with_regex(
            _input,
            RE_USDT,
            data_kind=TetherAddress.PLUGIN_NAME,
            validator=TetherValidator.run,
        )
        yield from found

示例#14

0

显示文件

文件： uri.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract data URIs from a string or a list of strings.

        Candidates match ``RE_DATA_URI`` and are checked by
        ``DataURIValidator.run``.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "validator": DataURIValidator.run,
            "data_kind": DataURIExtractor.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_DATA_URI, **options)

示例#15

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Ethereum (ETH) addresses from a string or a list of strings.

        Looks for legacy addresses and EIP-55 addresses. Candidates match
        ``RE_ETH`` and are checked by ``EthereumValidator.run``.

        :param _input: String or a list of strings
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "validator": EthereumValidator.run,
            "data_kind": EthereumAddressExtractor.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_ETH, **options)

示例#16

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Ripple (XRP) addresses from a string or a list of strings.

        See: https://xrpl.org/accounts.html#addresses

        Candidates match ``RE_XRP`` and are checked by ``RippleValidator.run``.

        :param _input: String or a list of strings to extract Ripple addresses from
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "validator": RippleValidator.run,
            "data_kind": RippleAddress.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_XRP, **options)

示例#17

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract e-mail addresses from a string or a list of strings.

        Candidates match ``RE_EMAIL`` (searched per line) and are checked by
        ``EmailValidator.run``.

        :param _input: String or a list of strings to extract e-mail addresses from
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        email_items = _extract_with_regex(
            _input,
            RE_EMAIL,
            data_kind=EmailExtractor.PLUGIN_NAME,
            per_line=True,
            validator=EmailValidator.run,
        )
        yield from email_items

示例#18

0

显示文件

文件： issn.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract ISSN identifiers from a string or a list of strings.

        Candidates match ``RE_ISSN`` and are checked by ``IssnValidator.run``;
        ``IssnExtractor.valid_issns`` is handed to ``_extract_with_regex`` as
        its ``cached_values`` argument.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "validator": IssnValidator.run,
            "cached_values": IssnExtractor.valid_issns,
            "data_kind": IssnExtractor.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_ISSN, **options)

示例#19

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract sequences of hex strings in which each pair of hex characters
        is separated by a chosen delimiter.

        The pattern is built by substituting the delimiter into
        ``HEX_PATTERN_TEMPLATE`` and is compiled case-insensitively.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :keyword delim: Delimiter separating 2-digit hex representation of a byte,
        can be regex pattern string. Defaults to empty string ("")
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        separator = kwargs.get("delim", "")
        pattern_text = HEX_PATTERN_TEMPLATE.format(delim=separator)
        hex_regex = re.compile(pattern_text, re.IGNORECASE)
        yield from _extract_with_regex(
            _input, hex_regex, data_kind=HexExtractor.PLUGIN_NAME
        )

示例#20

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Bitcoin Cash (BCH) addresses from a string or a list of strings.

        Candidates are checked by ``BitcoinCashValidator.run``.

        :param _input: String or a list of strings
        :keyword include_legacy: When truthy, search with ``RE_BCH_WITH_LEGACY``
        (legacy addresses conforming to BTC address format included); otherwise
        search with ``RE_BCH``. Defaults to True
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        if kwargs.get("include_legacy", True):
            pattern = RE_BCH_WITH_LEGACY
        else:
            pattern = RE_BCH
        yield from _extract_with_regex(
            _input,
            pattern,
            data_kind=BitcoinCashAddress.PLUGIN_NAME,
            validator=BitcoinCashValidator.run,
        )

示例#21

0

显示文件

文件： uri.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extracts URIs from a string or a list of strings.

        See https://tools.ietf.org/html/rfc3986

        .. warning::
            This method does no filtering on specific schemes. Therefore, it may return
            lots of noise patterns.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :keyword strict: Flag forwarded to ``URIValidator.run``. Defaults to False
        :keyword schemes: Iterable of schemes (e.g. http, data) a URI must start
        with; each scheme is also tried in its upper- and lower-cased form.
        Defaults to ``URI_SCHEMES``
        :return: Generator of the items produced by ``_extract_with_regex``

        NOTE(review): earlier documentation also listed a ``relative`` keyword,
        but this method never reads it.
        """
        strict = kwargs.get("strict", False)
        schemes = kwargs.get(
            "schemes", URI_SCHEMES
        )  # https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml

        # Collect every scheme as given plus its upper/lower-case variants.
        # Iterating (instead of list concatenation) accepts tuples and sets too.
        scheme_variants = set()
        for scheme in schemes:
            scheme_variants.update((scheme, scheme.upper(), scheme.lower()))

        # Scheme names may contain regex metacharacters ("+", "."), e.g.
        # "coap+tcp"; escape them so they are matched literally. Sorting makes
        # the generated pattern deterministic.
        scheme_alternation = "|".join(
            re.escape(scheme) for scheme in sorted(scheme_variants)
        )
        re_uri = re.compile(
            r"\b(?=(?:{}):){}\b".format(scheme_alternation, URI),
            re.VERBOSE,
        )

        def skip_without_scheme(text):
            # Cheap pre-filter: blank out text that has no run of two same-case
            # ASCII letters directly followed by ":".
            if re.search(r"(?:[a-z]{2}|[A-Z]{2}):", text) is None:
                return ""
            return text

        yield from _extract_with_regex(
            _input,
            re_uri,
            preprocess=skip_without_scheme,
            validator=lambda val: URIValidator.run(val, strict=strict, schemes=schemes),
            data_kind=URIExtractor.PLUGIN_NAME,
        )

示例#22

0

显示文件

文件： doi.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract DOI identifiers from a string or a list of strings.

        Candidates match ``RE_DOI`` (searched per line) and are checked by
        ``DoiValidator.run``.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        def drop_unbalanced_paren(found):
            # An opening parenthesis with no closing one: keep only the text
            # before the first "(".
            if "(" in found and ")" not in found:
                return found.split("(")[0]
            return found

        yield from _extract_with_regex(
            _input,
            RE_DOI,
            validator=DoiValidator.run,
            per_line=True,
            data_kind=DoiExtractor.PLUGIN_NAME,
            postprocess=drop_unbalanced_paren,
        )

示例#23

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Litecoin addresses from a string or a list of strings.

        Looks for addresses that start with 'M', 'L', or '3' char.

        .. warning::
            An address starting with '3' may represent a Bitcoin address.

        Candidates match ``RE_LTC`` and are checked by ``LitecoinValidator.run``.

        :param _input: String or a list of strings
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "validator": LitecoinValidator.run,
            "data_kind": LitecoinAddress.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_LTC, **options)

示例#24

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extracts MAC addresses

        Each item produced by ``_extract_with_regex`` for ``RE_MAC`` is yielded.
        When ``EUI(...).info`` is available for the item's ``"value"``, it is
        attached under the ``"info"`` key on a best-effort basis.

        :param _input: String or a list of strings to extract MAC address from
        :param kwargs: Arbitrary keyword arguments
        :return: Generator of MAC addresses
        """
        for mac in _extract_with_regex(
                _input, RE_MAC, data_kind=MACAddressExtractor.PLUGIN_NAME):
            try:
                info = EUI(mac["value"]).info
                if info:
                    # The textual form of ``info`` is turned into JSON by
                    # swapping single quotes for double quotes before parsing.
                    mac.update(
                        {"info": (json.loads(str(info).replace("'", '"')))})
            except Exception:
                # Enrichment is optional: any lookup or parsing failure leaves
                # the item without "info". KeyboardInterrupt and SystemExit
                # are no longer swallowed.
                pass
            yield mac

示例#25

0

显示文件

文件： ip.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract IPv6 address strings from a string or a list of strings.

        See https://tools.ietf.org/html/rfc3986#section-3.2.2 for the form of IPv6 address

        Candidates match ``RE_IPV6`` (searched per line).

        :param _input: String or a list of strings to extract IPv6 addresses from
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        def long_enough_and_valid(candidate):
            # Candidates of six characters or fewer are rejected outright.
            if len(candidate) <= 6:
                return False
            return IPv6AddressValidator.run(candidate)

        yield from _extract_with_regex(
            _input,
            RE_IPV6,
            validator=long_enough_and_valid,
            per_line=True,
            data_kind=IPv6AddressExtractor.PLUGIN_NAME,
        )

示例#26

0

显示文件

文件： uri.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract URLs from a string or a list of strings.
        URL must contain one of the following schemes:
        - `http`, `https`, `ftp`

        See https://tools.ietf.org/html/rfc3986

        Candidates match ``RE_URL`` and are checked by ``URLValidator.run``.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments (not read here)
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        url_items = _extract_with_regex(
            _input,
            RE_URL,
            data_kind=URLExtractor.PLUGIN_NAME,
            validator=URLValidator.run,
        )
        yield from url_items

示例#27

0

显示文件

文件： crypto.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract Bitcoin addresses from a string or a list of strings.

        Looks for addresses on mainnet:
        - base58-encoded : must conform to pattern /[13][a-km-zA-HJ-NP-Z1-9]{25,34}/
        - segwit (bech32-encoded) : must conform to pattern /(?:[bB][cC])1[a-zA-HJ-NP-Z0-9]{25,39}/

        See:
        - https://en.bitcoin.it/wiki/Address

        Candidates match ``RE_BTC`` and are checked by ``BitcoinValidator.run``.

        :param _input: String or a list of strings to extract Bitcoin addresses from
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        options = {
            "validator": BitcoinValidator.run,
            "data_kind": BitcoinAddress.PLUGIN_NAME,
        }
        yield from _extract_with_regex(_input, RE_BTC, **options)

示例#28

0

显示文件

文件： ip.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract IPv4 addresses strings from a string or a list of strings.

        See https://tools.ietf.org/html/rfc3986#section-3.2.2 for the form of IPv4 address

        Candidates match ``RE_IPV4`` (searched per line).

        :param _input: String or a list of strings to extract IPv4 addresses from
        :param kwargs: Arbitrary keyword arguments
        :return: Generator of IPv4 addresses
        """
        def validate(value):
            parts = value.split(".")
            # NOTE(review): this check used to read
            # ``first == "0" or len(first) > 1`` with ``first`` bound to the
            # whole split list, so the comparison with "0" could never be true
            # and only the "more than one dot-separated part" test had any
            # effect. That effective behaviour is kept here. If the intent was
            # to inspect the first octet (``value.split(".")[0]``), decide
            # deliberately -- it would reject single-digit first octets such
            # as "8.8.8.8".
            return IPv4AddressValidator.run(value) and len(parts) > 1

        yield from _extract_with_regex(
            _input,
            RE_IPV4,
            validator=validate,
            per_line=True,
            data_kind=IPv4AddressExtractor.PLUGIN_NAME,
        )

示例#29

0

显示文件

文件： uri.py 项目： espoem/MetExt

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extract form fields data in HTTP, URL.

        Candidates match ``RE_URL_FORM_FIELDS``.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :keyword min_len: Minimum length of extracted pattern.
        Defaults to 20.
        :keyword decode: Flag to percent (URL) decode the found pattern. Defaults to True
        :return: Generator of the items produced by ``_extract_with_regex``
        """
        min_len = kwargs.get("min_len", 20)
        decode = kwargs.get("decode", True)

        def acceptable(field_data):
            if len(field_data) < min_len:
                return False
            # Reject a lone "key=" with no value and no further fields.
            return "&" in field_data or not field_data.endswith("=")

        def finalize(field_data):
            if decode:
                return unquote_plus(field_data)
            return field_data

        yield from _extract_with_regex(
            _input,
            RE_URL_FORM_FIELDS,
            validator=acceptable,
            postprocess=finalize,
            data_kind=FormFieldsExtractor.PLUGIN_NAME,
        )

示例#30

0

显示文件

    def run(cls, _input: str, **kwargs) -> Iterable[dict]:
        """Extracts GUIDs strings.

        Candidates match ``RE_GUID`` (searched per line). Each yielded item has
        an ``"info"`` entry holding the ``variant`` and ``version`` reported by
        :class:`uuid.UUID`; items whose value cannot be parsed, or whose
        version is ``None``, are skipped.

        :param _input: String or a list of strings
        :param kwargs: Arbitrary keyword arguments
        :return: Generator of GUID strings
        """
        def validate(value):
            # A value with exactly four dashes is accepted as is.
            if value.count("-") == 4:
                return True
            # Otherwise inspect the character at index 12 of the dash-free
            # value. The length guard avoids an IndexError on short input.
            value = value.replace("-", "")
            return len(value) > 12 and value[12] in {"0", "1", "2", "3", "4", "5"}

        def get_info(value):
            try:
                res = uuid.UUID(value)
            except (ValueError, TypeError, AttributeError):
                # Malformed or non-string input: no info available.
                return None
            return {"variant": res.variant, "version": res.version}

        def normalize(value):
            # Lower-case and re-insert dashes in the 8-4-4-4-12 layout.
            value = value.replace(r"-", "").lower()
            return "{}-{}-{}-{}-{}".format(value[:8], value[8:12],
                                           value[12:16], value[16:20],
                                           value[20:])

        for obj in _extract_with_regex(
                _input,
                RE_GUID,
                per_line=True,
                data_kind=UuidExtractor.PLUGIN_NAME,
                validator=validate,
                postprocess=normalize,
        ):
            info = get_info(obj["value"])
            if not info or info["version"] is None:
                continue
            obj.update({"info": info})
            yield obj