Exemplo n.º 1
0
 def __init__(self, allow_missing_http: bool = False):
     """Build the URL regular expression and initialise the regex extractor.

     Args:
         allow_missing_http: When True the leading scheme ("http://",
             "https://" or "ftp://") is optional; when False it is
             required.
     """
     # reference: https://gist.github.com/dperini/729294, slightly modified
     # to match _ and (optionally) allow a missing scheme.
     scheme = r"(?:(?:https?|ftp)://)"
     if allow_missing_http:
         scheme += "?"

     # Both variants share everything after the scheme, so the body is
     # written once. Raw literals are used wherever the fragment contains
     # regex backslash escapes; the fragments that need Python to expand
     # \uXXXX stay non-raw and spell their regex backslashes as "\\".
     # The resulting pattern string is identical to the historical one,
     # without relying on invalid string escape sequences.
     url_pattern = scheme + (
         # optional user:password@ prefix
         r"(?:\S+(?::\S*)?@)?(?:"
         # lookaheads rejecting 10.x, 127.x, 169.254.x, 192.168.x, 172.16-31.x
         r"(?!(?:10|127)(?:\.\d{1,3}){3})"
         r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
         r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
         # dotted-quad IPv4 host
         r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
         r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
         r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|"
         # host name, further dot-separated labels, then the TLD
         "(?:(?:[a-z\u00a1-\uffff0-9][_-]?)*[a-z\u00a1-\uffff0-9]+)"
         "(?:\\.(?:[a-z\u00a1-\uffff0-9][_-]?)*[a-z\u00a1-\uffff0-9]+)*"
         "(?:\\.(?:[a-z\u00a1-\uffff]{2,})))"
         # optional port and path
         r"(?::\d{2,5})?(?:/\S*)?"
     )
     RegexExtractor.__init__(self,
                             pattern=url_pattern,
                             extractor_name="url extractor")
Exemplo n.º 2
0
 def __init__(self, support_Bech32: bool=False):
     """Build the bitcoin address pattern and initialise the regex extractor.

     Args:
         support_Bech32: When True, also match "bc1"-prefixed addresses
             (39 or 59 characters after the prefix) in addition to the
             addresses starting with 1 or 3.
     """
     # simple version supporting P2PKH and P2SH
     legacy_address = r"[13][a-km-zA-HJ-NP-Z1-9]{25,34}"
     if support_Bech32:
         # a regex supporting the Bech32 type (which is not supported by most
         # applications). The alternatives are wrapped in a group so that
         # both word boundaries apply to every alternative; without the
         # group the boundaries only bound the first and last alternative
         # and longer addresses could be matched as a truncated prefix.
         bitcoin_address_pattern = (
             r"\b(?:" + legacy_address
             + r"|bc1[a-zA-HJ-NP-Z0-9]{39}"
             + r"|bc1[a-zA-HJ-NP-Z0-9]{59})\b"
         )
     else:
         bitcoin_address_pattern = r"\b" + legacy_address + r"\b"
     RegexExtractor.__init__(self, pattern=bitcoin_address_pattern, extractor_name="bitcoin address extractor")
Exemplo n.º 3
0
    def test_match_mode_with_group(self) -> None:
        """MATCH mode with a two-group pattern, using 0 and 5 as second argument."""
        extractor = RegexExtractor('(.)@(.)', 'test_extractor')
        text = 'a@1, b@2, c@3, d@4'

        # Run both extractions up front, then compare the extracted values.
        hits_from_0 = extractor.extract(text, 0, MatchMode.MATCH)
        hits_from_5 = extractor.extract(text, 5, MatchMode.MATCH)

        # Each capture group is expected to come back as its own value.
        self.assertEqual([hit.value for hit in hits_from_0], ['a', '1'])
        self.assertEqual([hit.value for hit in hits_from_5], ['b', '2'])
Exemplo n.º 4
0
def run(args):
    """
    Print the value of every regex extraction found in the input lines.

    Args:
        args (argparse.Namespace): provides ``pattern`` (the regex) and
            ``input_file`` (an iterable of text lines).
    """
    extractor = RegexExtractor(pattern=args.pattern)

    # Warnings raised while extracting are silenced for the whole run.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # Lazily walk every line and every extraction found on it.
        found_values = (
            hit.value
            for text_line in args.input_file
            for hit in extractor.extract(text_line)
        )
        for found in found_values:
            print(found)
Exemplo n.º 5
0
    def test_search_mode_without_group(self) -> None:
        """SEARCH mode with a group-less pattern, using 0 and 18 as second argument."""
        extractor = RegexExtractor('.@.', 'test_extractor')
        text = 'testtesttest, a@1, b@2, c@3, d@4'

        # Run both extractions up front, then compare the extracted values.
        hits_from_0 = extractor.extract(text, 0, MatchMode.SEARCH)
        hits_from_18 = extractor.extract(text, 18, MatchMode.SEARCH)

        # A single whole-match value is expected from each call.
        self.assertEqual([hit.value for hit in hits_from_0], ['a@1'])
        self.assertEqual([hit.value for hit in hits_from_18], ['b@2'])
Exemplo n.º 6
0
    def test_split_mode(self) -> None:
        """SPLIT mode on a comma pattern, using 0 and 2 as second argument."""
        extractor = RegexExtractor(',', 'test_extractor')
        text = 'a@1, b@2, c@3, d@4'

        # Run both extractions up front, then compare the extracted values.
        pieces_with_0 = extractor.extract(text, 0, MatchMode.SPLIT)
        pieces_with_2 = extractor.extract(text, 2, MatchMode.SPLIT)

        # With 0 every comma splits; with 2 the text after the second
        # comma is expected to stay in one piece.
        self.assertEqual([piece.value for piece in pieces_with_0],
                         ['a@1', ' b@2', ' c@3', ' d@4'])
        self.assertEqual([piece.value for piece in pieces_with_2],
                         ['a@1', ' b@2', ' c@3, d@4'])
Exemplo n.º 7
0
    def test_findall_mode_without_group(self) -> None:
        """FINDALL mode with a group-less pattern, using 0 and 5 as second argument."""
        extractor = RegexExtractor('.@.', 'test_extractor')
        text = 'a@1, b@2, c@3, d@4'

        # Run both extractions up front, then compare the extracted values.
        hits_from_0 = extractor.extract(text, 0, MatchMode.FINDALL)
        hits_from_5 = extractor.extract(text, 5, MatchMode.FINDALL)

        # With 0 all four pairs are expected; with 5 the first pair is not.
        self.assertEqual([hit.value for hit in hits_from_0],
                         ['a@1', 'b@2', 'c@3', 'd@4'])
        self.assertEqual([hit.value for hit in hits_from_5],
                         ['b@2', 'c@3', 'd@4'])
Exemplo n.º 8
0
 def __init__(self):
     """Create one regex extractor per hash type, then initialise the base.

     Three sub-extractors are built for hex strings of 32 (md5),
     40 (sha1) and 64 (sha256) characters delimited by word boundaries,
     each tagged with its hash name.
     """
     e_name = 'cryptographic hash extractor'
     self._regex_extractors = [
         RegexExtractor(r"(\b[a-fA-F\d]{32}\b)",
                        'md5 ' + e_name,
                        general_tag='md5'),
         # Accept upper-case hex digits as well, consistent with the md5
         # and sha256 patterns; previously only lower-case sha1 digests
         # were recognised.
         RegexExtractor(r"(\b[0-9a-fA-F]{40}\b)",
                        'sha1 ' + e_name,
                        general_tag='sha1'),
         RegexExtractor(r"(\b[A-Fa-f0-9]{64}\b)",
                        'sha256 ' + e_name,
                        general_tag='sha256'),
     ]
     Extractor.__init__(self,
                        input_type=InputType.TEXT,
                        category="regex",
                        name=e_name)
Exemplo n.º 9
0
 def __init__(self):
     """Assemble the hostname pattern and initialise the regex extractor."""
     # One dot-terminated label: alphanumeric at both ends, with up to 61
     # alphanumerics or hyphens in between.
     dotted_label = r"(?:[a-zA-Z0-9](?:[a-zA-Z0-9\-]{,61}[a-zA-Z0-9])?\.)"
     # Suffixes that must not start the final segment.
     rejected_suffixes = ("html", "php", "jsp", "xml", "pdf",
                          "asp", "css", "aspx", "phtml")
     # Final segment: 2 to 6 letters not beginning with a rejected suffix.
     final_segment = "(?!" + "|".join(rejected_suffixes) + r")[a-zA-Z]{2,6}"

     RegexExtractor.__init__(
         self,
         pattern=r"\b" + dotted_label + "+" + final_segment + r"\b",
         extractor_name="hostname extractor",
     )
Exemplo n.º 10
0
 def __init__(self):
     """Initialise the regex extractor with a case-insensitive CVE id pattern."""
     # "CVE-", a four-digit part, "-", then a four-to-seven-digit part.
     four_digit_part = r"(?:\d{4})"
     sequence_part = r"(?:\d{4,7})"
     RegexExtractor.__init__(
         self,
         pattern="CVE-" + four_digit_part + "-" + sequence_part,
         flags=re.IGNORECASE,
         extractor_name="cve extractor",
     )
Exemplo n.º 11
0
 def __init__(self):
     """Initialise the regex extractor with the CVE identifier pattern.

     The pattern matches "CVE-", a four-digit part, "-", and a sequence
     part of four or more digits.
     """
     # The sequence part is not limited to four digits: identifiers with
     # longer sequence numbers would otherwise be matched only up to their
     # first four digits, yielding a truncated (wrong) id.
     cve_pattern = r"CVE-(?:\d{4})-(?:\d{4,})"
     RegexExtractor.__init__(self,
                             pattern=cve_pattern,
                             extractor_name="cve extractor")
Exemplo n.º 12
0
 def __init__(self):
     """Build the IPv4 address pattern and initialise the regex extractor.

     The pattern matches four octets (0-255) separated by "." or the word
     "dot", where each separator may be preceded and followed by one
     optional wrapping character (e.g. "1.2.3.4", "1[.]2[.]3[.]4",
     "1 dot 2 dot 3 dot 4").
     """
     # The alternatives are ordered from most to least specific. Regex
     # alternation takes the first alternative that succeeds, so with the
     # generic 1-3 digit form first, a final octet such as "255" was
     # matched as "25" and the extracted address was truncated.
     octet = r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9]?[0-9])"
     # NOTE(review): the first character class also accepts a literal "?"
     # or ":" before the dot; that looks unintended but is kept unchanged.
     separator = r"[ (?:\[]?(?:\.|dot)[ )\]]?"
     ip_address_pattern = r"(?:" + octet + separator + r"){3}" + octet
     RegexExtractor.__init__(self, pattern=ip_address_pattern, extractor_name="ip address extractor")