def test_scrape(self):
    '''
    Scrape ``data0`` and check the exact set of (form, valu) pairs produced.

    Every expected pair is removed from the result set, so a missing pair
    raises KeyError and any unexpected leftover fails the final length check.
    The same is then repeated with the scrape restricted to ``inet:email``.
    '''
    found = set(s_scrape.scrape(data0))
    self.len(9, found)

    expected = (
        ('hash:md5', 'a' * 32),
        ('inet:ipv4', '1.2.3.4'),
        ('inet:ipv4', '5.6.7.8'),
        ('inet:fqdn', 'WOOT.COM'),
        ('inet:fqdn', 'hehe.taxi'),
        ('inet:fqdn', 'vertex.link'),
        ('inet:server', '5.6.7.8:16'),
        ('inet:email', '*****@*****.**'),
        ('inet:email', '*****@*****.**'),
    )
    for ndef in expected:
        found.remove(ndef)
    self.len(0, found)

    # Second pass: only the email form is requested.
    emails = set(s_scrape.scrape(data0, 'inet:email'))
    self.len(2, emails)

    expected_emails = (
        ('inet:email', '*****@*****.**'),
        ('inet:email', '*****@*****.**'),
    )
    for ndef in expected_emails:
        emails.remove(ndef)
    self.len(0, emails)
def test_scrape(self):
    '''
    Scrape ``data0`` into a dict and check that exactly the expected keys exist.

    Each expected key is popped without a default, so a missing key raises
    KeyError; the final length check fails if anything is left over.
    '''
    found = dict(s_scrape.scrape(data0))
    print(repr(found))

    expected_keys = (
        ('inet:email', '*****@*****.**'),
        ('inet:email', '*****@*****.**'),
        ('inet:fqdn', 'hehe.taxi'),
        ('inet:ipv4', 0x01020304),
        ('inet:ipv4', 0x05060708),
        ('inet:tcp4', 0x050607080010),
        ('hash:md5', 'a' * 32),
    )
    for key in expected_keys:
        found.pop(key)

    self.len(0, found)
async def execStormCmd(self, runt, genr):
    '''
    Scrape the property values of each inbound node and yield nodes made from the results.

    For every (node, path) pulled from ``genr``, each selected property value is
    passed to ``s_scrape.scrape()``; every (form, valu) result is added with
    ``node.snap.addNode()`` and yielded with a path forked from the inbound path.
    When ``self.opts.refs`` is set, an ``edge:refs`` node linking the inbound
    node to the new node is also added and yielded.  When ``self.opts.join`` is
    set, the inbound node itself is yielded after its scraped nodes.

    Raises:
        s_exc.BadOptValu: If a name in ``self.opts.props`` is not present in
            ``node.form.props``.
    '''
    async for node, path in genr:  # type: s_node.Node, s_node.Path

        # The repr form of each prop value is what gets scraped when available.
        prop_reprs = node.reprs()

        # Reject any requested prop that the node's form does not define.
        for fprop in self.opts.props:
            if node.form.props.get(fprop, None) is None:
                raise s_exc.BadOptValu(
                    mesg=f'{fprop} not a valid prop for {node.ndef[1]}',
                    name='props',
                    valu=self.opts.props)

        # No explicit prop list means every prop set on the node.
        selected = self.opts.props or list(node.props.keys())

        for prop in selected:
            raw = node.props.get(prop)
            if raw is None:
                await runt.snap.printf(f'No prop ":{prop}" for {node.ndef}')
                continue

            # Fall back to the system mode value when there is no repr.
            text = prop_reprs.get(prop, raw)

            for form, valu in s_scrape.scrape(text):
                scraped = await node.snap.addNode(form, valu)
                yield scraped, path.fork(scraped)

                if self.opts.refs:
                    refnode = await node.snap.addNode(
                        'edge:refs', (node.ndef, scraped.ndef))
                    yield refnode, path.fork(refnode)

        if self.opts.join:
            yield node, path
async def __call__(self, text, ptype=None, refang=True, unique=True):
    '''
    Yield the items scraped from ``text`` (deprecated entry point).

    Args:
        text: Value to scrape; converted with ``s_stormtypes.tostr``.
        ptype: Optional form name passed to ``s_scrape.scrape`` as ``ptype``;
            None is allowed.
        refang: Converted with ``s_stormtypes.tobool`` and passed through to
            ``s_scrape.scrape``.
        unique: When true, an item already yielded is skipped.

    A deprecation notice is recorded and a one-time warning is sent to the
    runtime before any item is produced.
    '''
    text = await s_stormtypes.tostr(text)
    formname = await s_stormtypes.tostr(ptype, noneok=True)
    refang = await s_stormtypes.tobool(refang)
    unique = await s_stormtypes.tobool(unique)

    # Remove this in 3.0.0 since it is deprecated.
    s_common.deprecated('$lib.scrape()')
    await self.runt.warnonce(
        '$lib.scrape() is deprecated. Use $lib.scrape.ndefs().')

    async with await s_spooled.Set.anit() as seen:  # type: s_spooled.Set

        scraped = s_scrape.scrape(text, ptype=formname, refang=refang, first=False)
        for item in scraped:

            if unique and item in seen:
                continue

            if unique:
                await seen.add(item)

            yield item

            # Hand control back to the event loop between items.
            await asyncio.sleep(0)
def test_scrape_sequential(self):
    '''
    Check that two scrape-able values placed next to each other are both found.

    Each case is a text containing two values with some separator between
    them, paired with the set of values (element 1 of every scrape result)
    the scrape is expected to return.
    '''
    md5 = ('a' * 32, 'b' * 32)
    sha1 = ('c' * 40, 'd' * 40)
    sha256 = ('e' * 64, 'f' * 64)
    url = ('http://foobar.com', 'http://cat.net')
    ipv4 = ('1.2.3.4', '5.6.7.8')
    server = ('7.7.7.7:123', '8.8.8.8:456')
    fqdn = ('woot.com', 'baz.io')
    email = ('*****@*****.**', '*****@*****.**')

    def scraped_values(text):
        # Only the value half of each scrape result is compared.
        return {n[1] for n in s_scrape.scrape(text)}

    md5_vals = {md5[0], md5[1]}
    fqdn_vals = {fqdn[0], fqdn[1]}
    email_vals = {email[0], 'bar.io', email[1], 'zee.com'}

    # NOTE(review): several texts below look identical; the separators may
    # have differed in whitespace in the original layout - confirm.
    cases = (
        (f'hehe {md5[0]} {md5[1]} haha', md5_vals),
        (f'hehe {md5[0]},{md5[1]} haha', md5_vals),
        (f'hehe {sha1[0]} {sha1[1]} haha', {sha1[0], sha1[1]}),
        (f'hehe {sha256[0]} {sha256[1]} haha', {sha256[0], sha256[1]}),
        (f'hehe {url[0]} {url[1]} haha', {url[0], 'foobar.com', url[1], 'cat.net'}),
        (f'hehe {ipv4[0]} {ipv4[1]} haha', {ipv4[0], ipv4[1]}),
        (f'hehe {server[0]} {server[1]} haha', {server[0], '7.7.7.7', server[1], '8.8.8.8'}),
        (f'hehe "{fqdn[0]}" "{fqdn[1]}" haha', fqdn_vals),
        (f'hehe {fqdn[0]} {fqdn[1]} haha', fqdn_vals),
        (f'hehe {email[0]}, {email[1]} haha', email_vals),
        (f'hehe {fqdn[0]}. {fqdn[1]} haha', fqdn_vals),
        (f'hehe {fqdn[0]},{fqdn[1]} haha', fqdn_vals),
        (f'hehe {fqdn[0]} {fqdn[1]} haha', fqdn_vals),
        (f'hehe {email[0]}. {email[1]} haha', email_vals),
        (f'hehe {email[0]} {email[1]} haha', email_vals),
        (f'hehe {email[0]} {fqdn[0]} haha', {email[0], 'bar.io', fqdn[0]}),
    )

    for txt, expected in cases:
        self.eq(expected, scraped_values(txt))
def test_refang(self):
    '''
    Check that defanged indicators are scraped in their refanged form.

    Each case pairs a defanged text with the set of values (element 1 of
    every scrape result) expected back.  The final check scrapes with
    ``refang=False``.
    '''
    def scraped_values(text, **kwargs):
        # Only the value half of each scrape result is compared.
        return {n[1] for n in s_scrape.scrape(text, **kwargs)}

    cases = (
        ('10[.]0[.]0[.]1', {'10.0.0.1'}),
        ('www(.)spam(.)net', {'www.spam.net'}),
        ('http[:]//foo.faz.com[:]12312/bam',
         {'http://foo.faz.com:12312/bam', 'foo.faz.com'}),
        ('hxxp://foo.faz.edu/', {'http://foo.faz.edu/', 'foo.faz.edu'}),
        ('hXXps://foo.faz.edu/', {'https://foo.faz.edu/', 'foo.faz.edu'}),
        ('FXP://255.255.255.255', {'ftp://255.255.255.255', '255.255.255.255'}),
        ('fxps://255.255.255.255', {'ftps://255.255.255.255', '255.255.255.255'}),
        ('foo[at]bar.com', {'*****@*****.**', 'bar.com'}),
        ('foo[@]bar.com', {'*****@*****.**', 'bar.com'}),
        ('Im a text BLOB with 255(.)255(.)255.0 and hxxps[:]yowza(.)baz[.]edu/foofaz',
         {'yowza.baz.edu', '255.255.255.0'}),
        ('HXXP[:]//example.com?faz=hxxp and im talking about HXXP over here',
         {'http://example.com?faz=hxxp', 'example.com'}),
        ('''hxxp[://]beep-thing[.]com/beep[.]docx hxxps[://]beep[.]com/beep/gen[.]stuff ''',
         {
             'http://beep-thing.com/beep.docx',
             'https://beep.com/beep/gen.stuff',
             'beep-thing.com',
             'beep.com',
         }),
    )

    for defanged, exp in cases:
        self.eq(exp, scraped_values(defanged))

    # Test scrape without re-fang
    defanged = 'HXXP[:]//example.com?faz=hxxp and im talking about HXXP over here'
    self.eq({'example.com'}, scraped_values(defanged, refang=False))
def test_scrape_basic(self):
    '''
    Exercise ``s_scrape`` against the module's sample texts.

    Covers the form listing, rejection of a bad fang mapping, a CVE scrape,
    and the exact results for ``data0``, ``data1``, ``data2`` and the
    crypto currency address samples.  Expected results are removed from the
    scraped collection one by one, so a missing result raises immediately.
    '''
    def remove_addrs(container, coin, addrs):
        # Remove one crypto:currency:address result per address, in order.
        for addr in addrs:
            container.remove(('crypto:currency:address', (coin, addr)))

    forms = s_scrape.getForms()
    self.isin('inet:ipv4', forms)
    self.isin('crypto:currency:address', forms)
    self.notin('inet:web:message', forms)

    with self.raises(s_exc.BadArg):
        s_scrape.genFangRegex({'hehe': 'haha', 'newp': 'bignope'})

    ndefs = list(s_scrape.scrape('log4j vuln CVE-2021-44228 is pervasive'))
    self.eq(ndefs, (('it:sec:cve', 'CVE-2021-44228'),))

    # data0: every form
    nodes = set(s_scrape.scrape(data0))
    self.len(26, nodes)

    nodes.remove(('hash:md5', 'a' * 32))

    for ipv4 in ('1.2.3.4', '5.6.7.8'):
        nodes.remove(('inet:ipv4', ipv4))

    data0_fqdns = (
        'bar.com',
        'baz.com',
        'foobar.com',
        'WOOT.COM',
        'hehe.taxi',
        'vertex.link',
        'vĕrtex.com',
        'vĕr-tex.link',
        'faß.de',
        '👁️👄👁️.fm',
        '👁👄👁.com',
        'foo.bar。baz。lol',
        'xn--asdf.link',
        'mcafee.support.customer.com',
        'pound£.com',
        'sign1.com',
        'sign2.com',
        'sign3.com',
        'sign4.com',
        'tilde.com',
    )
    for fqdn in data0_fqdns:
        nodes.remove(('inet:fqdn', fqdn))

    nodes.remove(('inet:server', '5.6.7.8:16'))

    for email in ('*****@*****.**', '*****@*****.**'):
        nodes.remove(('inet:email', email))
    self.len(0, nodes)

    # data0: restricted to emails
    nodes = set(s_scrape.scrape(data0, 'inet:email'))
    self.len(2, nodes)
    for email in ('*****@*****.**', '*****@*****.**'):
        nodes.remove(('inet:email', email))
    self.len(0, nodes)

    # data1
    nodes = list(s_scrape.scrape(data1))
    self.len(10, nodes)
    for _ in range(5):
        nodes.remove(('inet:fqdn', 'foo.bar.org'))

    # URLs should not include any trailing periods or commas.
    for _ in range(4):
        nodes.remove(('inet:url', 'tcp://foo.bar.org:4665/'))
    nodes.remove(('inet:url', 'tcp://foo.bar.org:4665/,,..a'))

    # data2
    nodes = list(s_scrape.scrape(data2))
    data2_urls = (
        'https://www.foobar.com/things.html',
        'https://blog.newp.com/scrape/all/the/urls',
        'https://www.thingspace.com/blog/giggles.html',
        'https://testme.org/test.php',
        'https://c2server.com/evil/malware/doesnot[care+]aboutstandards{at-all}',
    )
    for url in data2_urls:
        nodes.remove(('inet:url', url))

    # bitcoin
    nodes = list(s_scrape.scrape(btc_addresses))
    self.len(11, nodes)
    remove_addrs(nodes, 'btc', (
        '1BvBMSEYstWetqTFn5Au4m4GFg7xJaNVN2',
        '16ftSEQ4ctQFDtVZiUBusQUjRrGhM3JYwe',
        '3279PyBGjZTnu1GNSXamReTj98kiYgZdtW',
        '3J98t1WpEZ73CNmQviecrnyiWrnqRhWNLy',
        'bc1qw508d6qejxtdg4y5r3zarvary0c5xw7kv8f3t4',
        'tb1qw508d6qejxtdg4y5r3zarvary0c5xw7kxpjzsx',
        'bcrt1qs758ursh4q9z627kt3pp5yysm78ddny6txaqgw',
        'bc1qrp33g0q5c5txsp9arysrx4k6zdkfs4nce4xj0gdcccefvpysxf3qccfmv3',
        'tb1qrp33g0q5c5txsp9arysrx4k6zdkfs4nce4xj0gdcccefvpysxf3q0sl5k7',
        'tb1qrp33g0q5c5txsp9arysrx4k6zdkfs4nce4xj0gdcccefvpysxf3q0sl5k7',
        'bc1pw508d6qejxtdg4y5r3zarvary0c5xw7kw508d6qejxtdg4y5r3zarvary0c5xw7k7grplx',
    ))
    self.len(0, nodes)

    # ethereum
    nodes = list(s_scrape.scrape(eth_addresses))
    self.len(9, nodes)
    remove_addrs(nodes, 'eth', (
        '0x001d3f1ef827552ae1114027bd3ecf1f086ba0f9',
        '0x52908400098527886e0f7030069857d2e4169ee7',
        '0x8617e340b3d01fa5f11f306f4090fd50e238070d',
        '0xde709f2102306220921060314715629080e2fb77',
        '0x27b1fdb04752bbc536007a920d24acb045561c26',
        '0x5aAeb6053F3E94C9b9A09f33669435E7Ef1BeAed',
        '0xfB6916095ca1df60bB79Ce92cE3Ea74c37c5d359',
        '0xdbF03B407c01E7cD3CBea99509d93f8DDDC8C6FB',
        '0xD1220A0cf47c7B9Be7A2E6BA89F429762e7b9aDb',
    ))
    self.len(0, nodes)

    # bitcoin cash
    nodes = list(s_scrape.scrape(bch_addresses))
    self.len(3, nodes)
    remove_addrs(nodes, 'bch', (
        'bitcoincash:qqeht8vnwag20yv8dvtcrd4ujx09fwxwsqqqw93w88',
        'bchtest:pqc3tyspqwn95retv5k3c5w4fdq0cxvv95u36gfk00',
        'bitcoincash:qqkv9wr69ry2p9l53lxp635va4h86wv435995w8p2h',
    ))

    # ripple
    nodes = list(s_scrape.scrape(xrp_addresses))
    self.len(10, nodes)
    remove_addrs(nodes, 'xrp', (
        'rG2ZJRab3EGBmpoxUyiF2guB3GoQTwMGEC',
        'rfBKzgkPt9EvSJmk1uhoWTauaFCaRh4jMp',
        'rLUEXYuLiQptky37CqLcm9USQpPiz5rkpD',
        'X7AcgcsBL6XDcUb289X4mJ8djcdyKaB5hJDWMArnXr61cqZ',
        'rG2ZJRab3EGBmpoxUyiF2guB3GoQTwMGEC',
        'rrrrrrrrrrrrrrrrrrrrrhoLvTp',
        'rrrrrrrrrrrrrrrrrrrrBZbvji',
        'rHb9CJAWyB4rj91VRWn96DkukG4bwdtyTh',
        'rrrrrrrrrrrrrrrrrNAMEtxvNvQ',
        'rrrrrrrrrrrrrrrrrrrn5RM1rHd',
    ))

    # substrate (polkadot / kusama)
    nodes = list(s_scrape.scrape(substrate_addresses))
    self.len(4, nodes)
    remove_addrs(nodes, 'dot', (
        '12uxb9baJaiHhCvMzijnCYbkiXpGQ24jhj4AmhNvrMEzWuoV',
        '1FRMM8PEiWXYax7rpS6X4XZX1aAAxSWx1CrKTyrVYhV24fg',
    ))
    remove_addrs(nodes, 'ksm', (
        'JL1eTcbzuZP99FjeySkDrMygNREPdbhRyV7iD5AsV4fDRcg',
        'CpjsLDC1JFyrhm3ftC9Gs4QoyrkHKhZKtK7YqGTRFtTafgp',
    ))

    # cardano
    nodes = list(s_scrape.scrape(cardano_addresses))
    self.len(7, nodes)
    remove_addrs(nodes, 'ada', (
        'Ae2tdPwUPEZFRbyhz3cpfC2CumGzNkFBN2L42rcUc2yjQpEkxDbkPodpMAi',
        'Ae2tdPwUPEYzs5BRbGcoS3DXvK8mwgggmESz4HqUwMyaS9eNksZGz1LMS9v',
        'Ae2tdPwUPEYxYNJw1He1esdZYvjmr4NtPzUsGTiqL9zd8ohjZYQcwu6kom7',
        'DdzFFzCqrhtCNjPk5Lei7E1FxnoqMoAYtJ8VjAWbFmDb614nNBWBwv3kt6QHJa59cGezzf6piMWsbK7sWRB5sv325QqWdRuusMqqLdMt',
        'DdzFFzCqrhsfdzUZxvuBkhV8Lpm9p43p9ubh79GCTkxJikAjKh51qhtCFMqUniC5tv5ZExyvSmAte2Du2tGimavSo6qSgXbjiy8qZRTg',
        'addr1vpu5vlrf4xkxv2qpwngf6cjhtw542ayty80v8dyr49rf5eg0yu80w',
        'addr1v8fet8gavr6elqt6q50skkjf025zthqu6vr56l5k39sp9aqlvz2g4',
    ))
def oper(self):
    '''
    Parse one query operator at the current offset and return its AST node.

    Leading whitespace is skipped, then the upcoming text is matched, in
    order, against: sub-queries and pivot/join/tag-tag prefixes, variable
    (list) assignment, filters, tag lifts, relative property pivots,
    ``for``/``switch``/``break``/``continue`` keywords, property lifts,
    storm commands and, last, anything ``s_scrape.scrape`` recognises in the
    whitespace-delimited token.

    Syntax problems are reported through ``self._raiseSyntaxError``.

    Raises:
        s_exc.NoSuchProp: If the name is not a known property or storm
            command and the token scrapes to nothing.
    '''
    self.ignore(whitespace)

    if not self.more():
        self._raiseSyntaxError('unexpected end of query text')

    # Sub-query first, then the syntax elements that can come before a
    # prop/oper name.  The first matching prefix wins.
    leading = (
        ('{', self.subquery),
        ('->', self.formpivot),
        ('-+>', self.formjoin),
        ('<-', self.formpivotin),
        ('<+-', self.formjoinin),
        ('##', self.lifttagtag),
    )
    for prefix, parse in leading:
        if self.nextstr(prefix):
            return parse()

    char = self.nextchar()

    # var list assignment
    # ($foo, $bar) = $baz
    if char == '(':
        varlist = self.varlist()
        self.ignore(whitespace)
        self.nextmust('=')
        self.ignore(whitespace)
        rhs = self.valu()
        return s_ast.VarListSetOper(kids=(varlist, rhs))

    # $foo = valu var assignment
    if char == '$':
        varname = self.varname()
        self.ignore(whitespace)
        self.nextmust('=')
        self.ignore(whitespace)
        rhs = self.valu()
        return s_ast.VarSetOper(kids=(varname, rhs))

    if char in ('+', '-'):
        return self.filtoper()

    if char == '#':
        return self.liftbytag()

    # :foo:bar relative property
    if char == ':':

        relprop = self.relprop()

        # :foo=10 here could be assignment...

        self.ignore(whitespace)

        if self.nextstr('->'):
            return self.proppivot(relprop)

        if self.nextstr('-+>'):
            return self.propjoin(relprop)

        if self.nextstrs('<-', '<+-'):
            self._raiseSyntaxError('Pivot in syntax does not currently support relative properties.')

    keyword = self.peek(varset)

    if keyword == 'for':
        return self.forloop()

    if keyword == 'switch':
        return self.switchcase()

    if keyword == 'break':
        # step over the keyword itself
        self.offs += len('break')
        return s_ast.BreakOper()

    if keyword == 'continue':
        # step over the keyword itself
        self.offs += len('continue')
        return s_ast.ContinueOper()

    # Remember where the name starts so the scrape fallback can rewind.
    start = self.offs

    name = self.noms(varset)
    if not name:
        self._raiseSyntaxError('unknown query syntax')

    if self.modelinfo.isprop(name):

        # before ignoring more whitespace, check for form#tag[=time]
        if self.nextstr('#'):

            tag = self.tagname()
            kids = [s_ast.Const(name), tag]

            self.ignore(whitespace)

            if self.nextchar() in cmprstart:
                kids.append(self.cmpr())
                kids.append(self.valu())

            return s_ast.LiftFormTag(kids=kids)

        self.ignore(whitespace)

        if self.nextchar() not in cmprstart:
            # lift by prop only
            return s_ast.LiftProp(kids=(s_ast.Const(name),))

        cmpr = self.cmpr()
        valu = self.valu()
        return s_ast.LiftPropBy(kids=(s_ast.Const(name), cmpr, valu))

    if name in self.stormcmds:

        argv = self.cmdargv()

        self.ignore(whitespace)

        # eat a trailing | from a command at the beginning
        if self.nextstr('|'):
            self.offs += 1

        return s_ast.CmdOper(kids=(s_ast.Const(name), argv))

    # rewind and noms until whitespace
    self.offs = start

    token = self.noms(until=whitespace)

    ndefs = list(s_scrape.scrape(token))
    if ndefs:
        return s_ast.LiftByScrape(ndefs)

    # Nothing matched: restore the offset before reporting the unknown name.
    self.offs = start
    raise s_exc.NoSuchProp(name=name)