class Extractor(unittest.TestCase):
    """Unit tests for URIExtractor.extracturis() and fqdn_from_uri()."""

    def setUp(self):
        self.candidate = URIExtractor()
        # domains on the skiplist must never appear in extraction results
        self.candidate.skiplist = ['skipme.com']

    def tearDown(self):
        pass

    def test_simple_text(self):
        """Extract URIs from plain text, honouring the skiplist."""
        txt = """hello http://bla.com please click on <a href="www.co.uk">slashdot.org/?a=c&f=m</a> www.skipme.com www.skipmenot.com/ x.co/4to2S http://allinsurancematters.net/lurchwont/ muahahaha x.org"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://bla.com' in uris, 'missing http://bla.com from %s, got only %s' % (txt, uris))
        self.assertTrue('www.co.uk' in uris)
        self.assertTrue('slashdot.org/?a=c&f=m' in uris)
        # skipmenot.com is NOT on the skiplist; only exact skiplist domains are dropped
        self.assertTrue('www.skipmenot.com/' in uris)
        self.assertTrue("skipme.com" not in " ".join(uris))
        self.assertTrue("http://allinsurancematters.net/lurchwont/" in uris)
        self.assertTrue("x.org" in uris, 'rule at the end not found')
        # bugfix: the original passed the bare string as the condition, which is
        # always truthy — the membership check was missing, so the assertion
        # could never fail
        self.assertTrue('x.co/4to2S' in uris, 'x.co short uri not found')

    def test_dotquad(self):
        """Bare and scheme-prefixed IPv4 addresses are extracted."""
        txt = """click on 1.2.3.4 or http://62.2.17.61/ or https://8.8.8.8/bla.com """
        uris = self.candidate.extracturis(txt)
        self.assertTrue('1.2.3.4' in uris)
        self.assertTrue('http://62.2.17.61/' in uris)
        self.assertTrue('https://8.8.8.8/bla.com' in uris)

    def test_ipv6(self):
        """Bracketed IPv6 literal URIs are extracted."""
        txt = """click on http://[1337:1558:100b:1337:21b:21ff:fe9d:4e4a]/blah """
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://[1337:1558:100b:1337:21b:21ff:fe9d:4e4a]/blah' in uris,
                        'ipv6 uri not extracted, got : %s' % uris)

    def test_uppercase(self):
        """Extraction must preserve the original case of the URI."""
        txt = """hello http://BLa.com please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://bla.com' not in uris, 'uris should not be lowercased')
        self.assertTrue('http://BLa.com' in uris, 'uri with uppercase not found')

    def test_url_without_file(self):
        """A query string directly after the host (no path) is kept."""
        txt = """lol http://roasty.familyhealingassist.ru?coil&commission blubb"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://roasty.familyhealingassist.ru?coil&commission' in uris,
                        'did not find uri, result was %s' % uris)

    def test_negative(self):
        """Strings that merely look URI-ish must not be extracted."""
        txt = """ 
yolo-hq.com&n=R3QY1V&c=0VZ1ND 1.2.3.4.5 1.2.3 2fwww.mktcompany.com.br%2forigem%2femail """
        uris = self.candidate.extracturis(txt)
        self.assertTrue(len(uris) == 0, "Invalid uris should not have been extracted: %s" % uris)

    def test_usernamepw(self):
        """Credentials in the userinfo part are masked (password) or kept (user only)."""
        txt = """ ftp://yolo:[email protected]/blubb/bloing/baz.zip ftp://[email protected]/blubb/bloing/baz.zip """
        uris = self.candidate.extracturis(txt)
        self.assertTrue('ftp://*****:*****@bla.com/blubb/bloing/baz.zip' in uris,
                        'did not find uri with username and pw. result was %s' % uris)
        self.assertTrue('ftp://[email protected]/blubb/bloing/baz.zip' in uris,
                        'did not find uri with username. result was %s' % uris)

    def test_url_with_at(self):
        """An @ in the query part must not be mistaken for userinfo."""
        txt = """hello http://www.recswangy.com/[email protected]&clt=EH please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://www.recswangy.com/[email protected]&clt=EH' in uris,
                        'uri with @ character not found')

    def test_ending_qmark(self):
        """A trailing ? stays part of the extracted URI."""
        txt = """aaa http://hoostie.com/rescatenews/files/images/dw_logo.png? bbb"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://hoostie.com/rescatenews/files/images/dw_logo.png?' in uris,
                        'uri with ending ? not found')

    def test_url_with_bracket(self):
        """A ] in the path stays part of the extracted URI."""
        txt = """hello http://phohoanglong.com/]Spyware please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://phohoanglong.com/]Spyware' in uris,
                        'uri with ] character in path not found')

    def test_url_with_tilde(self):
        """A ~ in the path stays part of the extracted URI."""
        txt = """http://vanwinkle.de/NEW.IMPORTANT-NATWEST~BANKLINE-FORM/new_bankline.html please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://vanwinkle.de/NEW.IMPORTANT-NATWEST~BANKLINE-FORM/new_bankline.html' in uris,
                        'uri with ~ character in path not found')

    def test_url_after_parentheses(self):
        """A closing parenthesis immediately before the scheme is stripped."""
        txt = ")http://vhyue.com/gbn3q/jahy6?id=8071100&pass=EmxUo4ST&mid=498270380&m=detail"
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://vhyue.com/gbn3q/jahy6?id=8071100&pass=EmxUo4ST&mid=498270380&m=detail' in uris,
                        'uri after closing parentheses not found')

    def test_url_with_port(self):
        """An explicit port stays part of the extracted URI."""
        txt = " http://www.ironchampusa.ru:8177/247emaillists/ "
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://www.ironchampusa.ru:8177/247emaillists/' in uris,
                        'uri with port not found')

    def test_fqdn_from_uri(self):
        """fqdn_from_uri strips scheme, port and path, returning only the host."""
        # assertEquals is a deprecated alias; use assertEqual
        self.assertEqual(fqdn_from_uri('http://www.ironchampusa.ru:8177/247emaillists/'),
                         'www.ironchampusa.ru')

    def test_url_with_leading_crap(self):
        """Garbage bytes directly before the scheme are stripped."""
        txt = " ��*http://f5399r5hxs.com/epPgyPk/yYluS3/LPjyRhr/SlqRhe/YeuVlrX/maSsBVk/BiRJU "
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://f5399r5hxs.com/epPgyPk/yYluS3/LPjyRhr/SlqRhe/YeuVlrX/maSsBVk/BiRJU' in uris,
                        'uri with leading crap chars not found')
class URIExtract(ScannerPlugin):
    """Extract URIs from message bodies and store them as list in tag body.uris"""

    def __init__(self, config, section=None):
        ScannerPlugin.__init__(self, config, section)
        self.logger = self._logger()
        # created lazily in _prepare() so the skiplist is loaded only once,
        # on first use
        self.extractor = None
        self.htmlparser = HTMLParser()
        self.requiredvars = {
            'domainskiplist': {
                'default': '/etc/fuglu/extract-skip-domains.txt',
                'description': 'Domain skip list',
            },
            'maxsize': {
                'default': '10485000',
                'description': 'Maximum size of processed mails. Larger mail will be skipped.',
            },
            'loguris': {
                'default': 'no',
                'description': 'print extracted uris in fuglu log',
            },
        }

    def _prepare(self):
        """Lazily build the URIExtractor and load the configured domain skiplist.

        Safe to call on every message; only does work on the first call.
        """
        if self.extractor is None:
            self.extractor = URIExtractor()
            skiplist = self.config.get(self.section, 'domainskiplist')
            if skiplist != '':
                self.extractor.load_skiplist(skiplist)

    def _run(self, suspect):
        """Extract URIs from all text parts of ``suspect`` into tag ``body.uris``.

        Always returns DUNNO: this plugin only tags, it never blocks mail.
        """
        if not DOMAINMAGIC_AVAILABLE:
            self.logger.info('Not scanning - Domainmagic not available')
            return DUNNO

        # skip oversized mails to bound processing time
        maxsize = self.config.getint(self.section, 'maxsize')
        if suspect.size > maxsize:
            self.logger.info(
                'Not scanning - message too big (message %s bytes > config %s bytes )' %
                (suspect.size, maxsize))
            return DUNNO

        self._prepare()

        uris = []
        for content in self.get_decoded_textparts(suspect):
            # a failure in one part must not prevent extraction from the others
            try:
                parturis = self.extractor.extracturis(content)
                uris.extend(parturis)
            except Exception as e:
                self.logger.error(
                    '%s failed to extract URIs from msg part: %s' % (suspect.id, str(e)))

        if self.config.getboolean(self.section, 'loguris'):
            self.logger.info('%s Extracted URIs: %s' % (suspect.id, uris))
        suspect.set_tag('body.uris', uris)
        return DUNNO

    def process(self, suspect, decision):
        # after-queue entry point: tag only, decision is not altered
        self._run(suspect)

    def examine(self, suspect):
        # scanner entry point: returns the plugin verdict (always DUNNO)
        return self._run(suspect)

    def get_decoded_textparts(self, suspect, bcompatible=True):
        """bcompatible True will work with FUGLU version before implementation
        of attachment manager in Suspect
        """
        textparts = []

        # older FUGLU versions have no attachment manager on Suspect;
        # fall back to walking the message tree ourselves
        try:
            att_mgr = suspect.att_mgr
        except AttributeError:
            message = 'This version of URIextract is supposed to use a FUGLU version with Attachment Manager. \n' \
                      'Please update your FUGLU version'
            if bcompatible:
                self.logger.warning(message)
            else:
                raise AttributeError(message)
            return self.get_decoded_textparts_deprecated(suspect)

        for attObj in att_mgr.get_objectlist():
            # consider anything declared text/* or named like a text/html file
            if attObj.content_fname_check(contenttype_start="text/") \
                    or attObj.content_fname_check(name_end=(".txt", ".html", ".htm")):
                decoded_payload = attObj.decoded_buffer_text

                if attObj.content_fname_check(contenttype_contains="html") \
                        or attObj.content_fname_check(name_contains=".htm"):
                    # remove newlines from html so we get uris spanning multiple lines
                    decoded_payload = decoded_payload.replace(u'\n', u'').replace(u'\r', u'')

                    try:
                        decoded_payload = self.htmlparser.unescape(decoded_payload)
                    except Exception:
                        self.logger.debug('%s failed to unescape html entities' % suspect.id)

                textparts.append(decoded_payload)

            if attObj.content_fname_check(contenttype="multipart/alternative"):
                textparts.append(attObj.decoded_buffer_text)

        return textparts

    def get_decoded_textparts_deprecated(self, suspect):
        """Returns a list of all text contents"""
        messagerep = suspect.get_message_rep()

        textparts = []
        for part in messagerep.walk():
            # containers carry no payload of their own
            if part.is_multipart():
                continue
            fname = part.get_filename(None)
            if fname is None:
                fname = ""
            fname = fname.lower()
            contenttype = part.get_content_type()

            if contenttype.startswith('text/') or fname.endswith(".txt") \
                    or fname.endswith(".html") or fname.endswith(".htm"):
                payload = part.get_payload(None, True)
                if payload is not None:
                    # Try to decode using the given char set (or utf-8 by default)
                    charset = part.get_content_charset("utf-8")
                    payload = force_uString(payload, encodingGuess=charset)

                    if 'html' in contenttype or '.htm' in fname:
                        # remove newlines from html so we get uris spanning multiple lines
                        payload = payload.replace('\n', '').replace('\r', '')
                        try:
                            payload = self.htmlparser.unescape(payload)
                        except Exception:
                            self.logger.debug('%s failed to unescape html entities' % suspect.id)

                    textparts.append(payload)

            if contenttype == 'multipart/alternative':
                try:
                    payload = part.get_payload(None, True)
                    if payload is not None:
                        # Try to decode using the given char set
                        charset = part.get_content_charset("utf-8")
                        text = force_uString(payload, encodingGuess=charset)
                        textparts.append(text)
                except (UnicodeEncodeError, UnicodeDecodeError):
                    self.logger.debug(
                        '%s failed to convert alternative part to string' % suspect.id)

        return textparts

    def lint(self):
        """Config/environment self-check run by 'fuglu --lint'."""
        allok = True
        if not DOMAINMAGIC_AVAILABLE:
            print("ERROR: domainmagic lib or one of its dependencies (dnspython/pygeoip) is not installed!")
            allok = False
        if allok:
            allok = self.check_config()
        return allok