class Extractor(unittest.TestCase):
    """Unit tests for URIExtractor.extracturis() and fqdn_from_uri()."""

    def setUp(self):
        self.candidate = URIExtractor()
        # domains on the skiplist must never appear in extraction results
        self.candidate.skiplist = ['skipme.com']

    def tearDown(self):
        pass

    def test_simple_text(self):
        """Extract URIs from plain text, honouring the skiplist."""
        txt = """hello http://bla.com please click on <a href="www.co.uk">slashdot.org/?a=c&f=m</a> www.skipme.com www.skipmenot.com/ x.co/4to2S http://allinsurancematters.net/lurchwont/ muahahaha x.org"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://bla.com' in uris, 'missing http://bla.com from %s, got only %s' % (txt, uris))
        self.assertTrue('www.co.uk' in uris)
        self.assertTrue('slashdot.org/?a=c&f=m' in uris)
        # skipmenot.com is NOT on the skiplist; only exact skiplist domains are dropped
        self.assertTrue('www.skipmenot.com/' in uris)
        self.assertTrue("skipme.com" not in " ".join(uris))
        self.assertTrue("http://allinsurancematters.net/lurchwont/" in uris)
        self.assertTrue("x.org" in uris, 'rule at the end not found')
        # bugfix: the original passed the bare string as the condition, which is
        # always truthy — the membership check was missing, so the assertion
        # could never fail
        self.assertTrue('x.co/4to2S' in uris, 'x.co short uri not found')

    def test_dotquad(self):
        """Bare and scheme-prefixed IPv4 addresses are extracted."""
        txt = """click on 1.2.3.4 or http://62.2.17.61/ or https://8.8.8.8/bla.com """
        uris = self.candidate.extracturis(txt)
        self.assertTrue('1.2.3.4' in uris)
        self.assertTrue('http://62.2.17.61/' in uris)
        self.assertTrue('https://8.8.8.8/bla.com' in uris)

    def test_ipv6(self):
        """Bracketed IPv6 literal URIs are extracted."""
        txt = """click on http://[1337:1558:100b:1337:21b:21ff:fe9d:4e4a]/blah """
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://[1337:1558:100b:1337:21b:21ff:fe9d:4e4a]/blah' in uris,
                        'ipv6 uri not extracted, got : %s' % uris)

    def test_uppercase(self):
        """Extraction must preserve the original case of the URI."""
        txt = """hello http://BLa.com please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://bla.com' not in uris, 'uris should not be lowercased')
        self.assertTrue('http://BLa.com' in uris, 'uri with uppercase not found')

    def test_url_without_file(self):
        """A query string directly after the host (no path) is kept."""
        txt = """lol http://roasty.familyhealingassist.ru?coil&commission blubb"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://roasty.familyhealingassist.ru?coil&commission' in uris,
                        'did not find uri, result was %s' % uris)

    def test_negative(self):
        """Strings that merely look URI-ish must not be extracted."""
        txt = """ 
yolo-hq.com&n=R3QY1V&c=0VZ1ND 1.2.3.4.5 1.2.3 2fwww.mktcompany.com.br%2forigem%2femail """
        uris = self.candidate.extracturis(txt)
        self.assertTrue(len(uris) == 0, "Invalid uris should not have been extracted: %s" % uris)

    def test_usernamepw(self):
        """Credentials in the userinfo part are masked (password) or kept (user only)."""
        txt = """ ftp://yolo:[email protected]/blubb/bloing/baz.zip ftp://[email protected]/blubb/bloing/baz.zip """
        uris = self.candidate.extracturis(txt)
        self.assertTrue('ftp://*****:*****@bla.com/blubb/bloing/baz.zip' in uris,
                        'did not find uri with username and pw. result was %s' % uris)
        self.assertTrue('ftp://[email protected]/blubb/bloing/baz.zip' in uris,
                        'did not find uri with username. result was %s' % uris)

    def test_url_with_at(self):
        """An @ in the query part must not be mistaken for userinfo."""
        txt = """hello http://www.recswangy.com/[email protected]&clt=EH please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://www.recswangy.com/[email protected]&clt=EH' in uris,
                        'uri with @ character not found')

    def test_ending_qmark(self):
        """A trailing ? stays part of the extracted URI."""
        txt = """aaa http://hoostie.com/rescatenews/files/images/dw_logo.png? bbb"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://hoostie.com/rescatenews/files/images/dw_logo.png?' in uris,
                        'uri with ending ? not found')

    def test_url_with_bracket(self):
        """A ] in the path stays part of the extracted URI."""
        txt = """hello http://phohoanglong.com/]Spyware please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://phohoanglong.com/]Spyware' in uris,
                        'uri with ] character in path not found')

    def test_url_with_tilde(self):
        """A ~ in the path stays part of the extracted URI."""
        txt = """http://vanwinkle.de/NEW.IMPORTANT-NATWEST~BANKLINE-FORM/new_bankline.html please click"""
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://vanwinkle.de/NEW.IMPORTANT-NATWEST~BANKLINE-FORM/new_bankline.html' in uris,
                        'uri with ~ character in path not found')

    def test_url_after_parentheses(self):
        """A closing parenthesis immediately before the scheme is stripped."""
        txt = ")http://vhyue.com/gbn3q/jahy6?id=8071100&pass=EmxUo4ST&mid=498270380&m=detail"
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://vhyue.com/gbn3q/jahy6?id=8071100&pass=EmxUo4ST&mid=498270380&m=detail' in uris,
                        'uri after closing parentheses not found')

    def test_url_with_port(self):
        """An explicit port stays part of the extracted URI."""
        txt = " http://www.ironchampusa.ru:8177/247emaillists/ "
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://www.ironchampusa.ru:8177/247emaillists/' in uris,
                        'uri with port not found')

    def test_fqdn_from_uri(self):
        """fqdn_from_uri strips scheme, port and path, returning only the host."""
        # assertEquals is a deprecated alias; use assertEqual
        self.assertEqual(fqdn_from_uri('http://www.ironchampusa.ru:8177/247emaillists/'),
                         'www.ironchampusa.ru')

    def test_url_with_leading_crap(self):
        """Garbage bytes directly before the scheme are stripped."""
        txt = " ��*http://f5399r5hxs.com/epPgyPk/yYluS3/LPjyRhr/SlqRhe/YeuVlrX/maSsBVk/BiRJU "
        uris = self.candidate.extracturis(txt)
        self.assertTrue('http://f5399r5hxs.com/epPgyPk/yYluS3/LPjyRhr/SlqRhe/YeuVlrX/maSsBVk/BiRJU' in uris,
                        'uri with leading crap chars not found')
class URIExtract(ScannerPlugin):
    """Extract URIs from message bodies and store them as list in tag body.uris"""

    def __init__(self, config, section=None):
        ScannerPlugin.__init__(self, config, section)
        self.logger = self._logger()
        # created lazily in _prepare() so the skiplist is loaded only once,
        # on first use
        self.extractor = None
        self.htmlparser = HTMLParser()
        self.requiredvars = {
            'domainskiplist': {
                'default': '/etc/fuglu/extract-skip-domains.txt',
                'description': 'Domain skip list',
            },
            'maxsize': {
                'default': '10485000',
                'description': 'Maximum size of processed mails. Larger mail will be skipped.',
            },
            'loguris': {
                'default': 'no',
                'description': 'print extracted uris in fuglu log',
            },
        }

    def _prepare(self):
        """Lazily build the URIExtractor and load the configured domain skiplist.

        Safe to call on every message; only does work on the first call.
        """
        if self.extractor is None:
            self.extractor = URIExtractor()
            skiplist = self.config.get(self.section, 'domainskiplist')
            if skiplist != '':
                self.extractor.load_skiplist(skiplist)

    def _run(self, suspect):
        """Extract URIs from all text parts of ``suspect`` into tag ``body.uris``.

        Always returns DUNNO: this plugin only tags, it never blocks mail.
        """
        if not DOMAINMAGIC_AVAILABLE:
            self.logger.info('Not scanning - Domainmagic not available')
            return DUNNO

        # skip oversized mails to bound processing time
        maxsize = self.config.getint(self.section, 'maxsize')
        if suspect.size > maxsize:
            self.logger.info(
                'Not scanning - message too big (message %s bytes > config %s bytes )' %
                (suspect.size, maxsize))
            return DUNNO

        self._prepare()

        uris = []
        for content in self.get_decoded_textparts(suspect):
            # a failure in one part must not prevent extraction from the others
            try:
                parturis = self.extractor.extracturis(content)
                uris.extend(parturis)
            except Exception as e:
                self.logger.error(
                    '%s failed to extract URIs from msg part: %s' % (suspect.id, str(e)))

        if self.config.getboolean(self.section, 'loguris'):
            self.logger.info('%s Extracted URIs: %s' % (suspect.id, uris))
        suspect.set_tag('body.uris', uris)
        return DUNNO

    def process(self, suspect, decision):
        # after-queue entry point: tag only, decision is not altered
        self._run(suspect)

    def examine(self, suspect):
        # scanner entry point: returns the plugin verdict (always DUNNO)
        return self._run(suspect)

    def get_decoded_textparts(self, suspect, bcompatible=True):
        """bcompatible True will work with FUGLU version before implementation
        of attachment manager in Suspect
        """
        textparts = []

        # older FUGLU versions have no attachment manager on Suspect;
        # fall back to walking the message tree ourselves
        try:
            att_mgr = suspect.att_mgr
        except AttributeError:
            message = 'This version of URIextract is supposed to use a FUGLU version with Attachment Manager. \n' \
                      'Please update your FUGLU version'
            if bcompatible:
                self.logger.warning(message)
            else:
                raise AttributeError(message)
            return self.get_decoded_textparts_deprecated(suspect)

        for attObj in att_mgr.get_objectlist():
            # consider anything declared text/* or named like a text/html file
            if attObj.content_fname_check(contenttype_start="text/") \
                    or attObj.content_fname_check(name_end=(".txt", ".html", ".htm")):
                decoded_payload = attObj.decoded_buffer_text

                if attObj.content_fname_check(contenttype_contains="html") \
                        or attObj.content_fname_check(name_contains=".htm"):
                    # remove newlines from html so we get uris spanning multiple lines
                    decoded_payload = decoded_payload.replace(u'\n', u'').replace(u'\r', u'')

                    try:
                        decoded_payload = self.htmlparser.unescape(decoded_payload)
                    except Exception:
                        self.logger.debug('%s failed to unescape html entities' % suspect.id)

                textparts.append(decoded_payload)

            if attObj.content_fname_check(contenttype="multipart/alternative"):
                textparts.append(attObj.decoded_buffer_text)

        return textparts

    def get_decoded_textparts_deprecated(self, suspect):
        """Returns a list of all text contents"""
        messagerep = suspect.get_message_rep()

        textparts = []
        for part in messagerep.walk():
            # containers carry no payload of their own
            if part.is_multipart():
                continue
            fname = part.get_filename(None)
            if fname is None:
                fname = ""
            fname = fname.lower()
            contenttype = part.get_content_type()

            if contenttype.startswith('text/') or fname.endswith(".txt") \
                    or fname.endswith(".html") or fname.endswith(".htm"):
                payload = part.get_payload(None, True)
                if payload is not None:
                    # Try to decode using the given char set (or utf-8 by default)
                    charset = part.get_content_charset("utf-8")
                    payload = force_uString(payload, encodingGuess=charset)

                    if 'html' in contenttype or '.htm' in fname:
                        # remove newlines from html so we get uris spanning multiple lines
                        payload = payload.replace('\n', '').replace('\r', '')
                        try:
                            payload = self.htmlparser.unescape(payload)
                        except Exception:
                            self.logger.debug('%s failed to unescape html entities' % suspect.id)

                    textparts.append(payload)

            if contenttype == 'multipart/alternative':
                try:
                    payload = part.get_payload(None, True)
                    if payload is not None:
                        # Try to decode using the given char set
                        charset = part.get_content_charset("utf-8")
                        text = force_uString(payload, encodingGuess=charset)
                        textparts.append(text)
                except (UnicodeEncodeError, UnicodeDecodeError):
                    self.logger.debug(
                        '%s failed to convert alternative part to string' % suspect.id)

        return textparts

    def lint(self):
        """Config/environment self-check run by 'fuglu --lint'."""
        allok = True
        if not DOMAINMAGIC_AVAILABLE:
            print("ERROR: domainmagic lib or one of its dependencies (dnspython/pygeoip) is not installed!")
            allok = False
        if allok:
            allok = self.check_config()
        return allok