Example #1
def get_urls(url, depth=1):
    if depth > 5:
        print('Too many redirects.')
        return
    fex = Faup()

    def meta_redirect(content):
        c = content.lower()
        soup = BeautifulSoup(c, "html.parser")
        for result in soup.find_all(attrs={'http-equiv': 'refresh'}):
            if result:
                out = result["content"].split(";")
                if len(out) == 2:
                    wait, text = out
                    a, url = text.split('=', 1)
                    return url.strip()
        return None

    resolve, reason = try_resolve(fex, url)
    if not resolve:
        # FIXME: inform that the domain does not resolve
        yield url
        return

    logging.debug("Making HTTP connection to " + url)

    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'}
    try:
        response = requests.get(url, allow_redirects=True, headers=headers,
                                timeout=15, verify=False)
    except Exception:
        # That one can fail (DNS for example)
        # FIXME: inform that the get failed
        yield url
        return
    if response.history is not None:
        for h in response.history:
            # Yield the URLs in the order we find them
            yield h.url

    yield response.url

    meta_redir_url = meta_redirect(response.content)
    if meta_redir_url is not None:
        depth += 1
        if not meta_redir_url.startswith('http'):
            fex.decode(url)
            base = '{}://{}'.format(fex.get_scheme(), fex.get_host())
            port = fex.get_port()
            if port is not None:
                base += ':{}'.format(port)
            if not meta_redir_url.startswith('/'):
                # relative redirect. resource_path has the initial '/'
                if fex.get_resource_path() is not None:
                    base += fex.get_resource_path()
            if not base.endswith('/'):
                base += '/'
            meta_redir_url = base + meta_redir_url
        for url in get_urls(meta_redir_url, depth):
            yield url
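Since get_urls() above is a generator, callers typically drain it to obtain the whole redirect chain. A minimal consumer sketch, assuming the function above and its dependencies (requests, BeautifulSoup, Faup, try_resolve) are importable in the current module:

# Hypothetical consumer of get_urls(); the target URL is illustrative only.
seen = []
for hop in get_urls('http://example.com/redirect-me'):
    if hop and hop not in seen:
        seen.append(hop)  # keep hops in the order they were yielded
print(seen)  # the requested URL and every redirect target, ending at the final landing page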
Example #2
def get_urls(url, depth=1):
    if depth > 5:
        print('Too many redirects.')
        return
    fex = Faup()

    def meta_redirect(content):
        c = content.lower()
        soup = BeautifulSoup(c, "html.parser")
        for result in soup.find_all(attrs={'http-equiv': 'refresh'}):
            if result:
                out = result["content"].split(";")
                if len(out) == 2:
                    wait, text = out
                    a, url = text.split('=', 1)
                    return url.strip()
        return None

    resolve, reason = try_resolve(fex, url)
    if not resolve:
        # FIXME: inform that the domain does not resolve
        yield url
        return

    logging.debug("Making HTTP connection to " + url)

    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'}
    try:
        response = requests.get(url, allow_redirects=True, headers=headers,
                                timeout=15, verify=False)
    except Exception:
        # That one can fail (DNS for example)
        # FIXME: inform that the get failed
        yield url
        return
    if response.history is not None:
        for h in response.history:
            # Yield the URLs in the order we find them
            yield h.url

    yield response.url

    meta_redir_url = meta_redirect(response.content)
    if meta_redir_url is not None:
        depth += 1
        if not meta_redir_url.startswith('http'):
            fex.decode(url)
            base = '{}://{}'.format(fex.get_scheme(), fex.get_host())
            port = fex.get_port()
            if port is not None:
                base += ':{}'.format(port)
            if not meta_redir_url.startswith('/'):
                # relative redirect. resource_path has the initial '/'
                if fex.get_resource_path() is not None:
                    base += fex.get_resource_path()
            if not base.endswith('/'):
                base += '/'
            meta_redir_url = base + meta_redir_url
        for url in get_urls(meta_redir_url, depth):
            yield url
Example #3
    def __post_init__(self):
        f = Faup()  # Example code at https://programtalk.com/python-examples-amp/pyfaup.faup.Faup/
        f.decode(self.url)

        self.scheme = f.get_scheme()
        self.top_level_domain = f.get_tld()
        self.domain = f.get_domain()
        self.subdomain = f.get_subdomain()
        self.path = f.get_resource_path()
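For reference, a standalone sketch of the Faup calls this __post_init__ relies on; it assumes the pyfaup package is installed and the URL is purely illustrative:

from pyfaup.faup import Faup

f = Faup()
f.decode('https://mail.example.co.uk/inbox/index.html')
# Each getter returns the parsed component, or None when it is absent;
# depending on the pyfaup version the values may come back as str or bytes.
print(f.get_scheme(), f.get_tld(), f.get_domain(), f.get_subdomain(), f.get_resource_path())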
Example #4
    def sort(self, elem_links, url):
        fex = Faup()
        f = Filters()
        f.load()
        self.r.switchDB(1)
        extend = True
        domainfilter = True
        schemefilter = True
        try:
            for link in elem_links:
                new_url = link
                self.r.switchDB(2)
                if not self.r.get(new_url) and new_url:
                    self.r.switchDB(1)
                    if not self.r.get(new_url):
                        fex.decode(new_url)
                        domain = fex.get_host()
                        if f.isfilteredscheme(fex.get_scheme()):
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)
                            schemefilter = False
                        if f.isfiltereddomains(domain):
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)
                            domainfilter = False
                        if f.isfilteredextention(fex.get_resource_path()):
                            extend = False
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)

                        if extend and domainfilter and schemefilter:
                            self.r.switchDB(1)
                            self.r.rpush('crawl', new_url)
                            self.queue.append(new_url)
        except TypeError as e:
            print "TypeError"
Example #5
File: crawler.py  Project: 5l1v3r1/OSINT-1
    def sort(self, elem_links, url):
        fex = Faup()
        f = Filters()
        f.load()
        self.r.switchDB(1)
        extend = True
        domainfilter = True
        schemefilter = True
        try:
            for link in elem_links:
                new_url = link
                self.r.switchDB(2)
                if not self.r.get(new_url) and new_url:
                    self.r.switchDB(1)
                    if not self.r.get(new_url):
                        fex.decode(new_url)
                        domain = fex.get_host()
                        if f.isfilteredscheme(fex.get_scheme()):
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)
                            schemefilter = False
                        if f.isfiltereddomains(domain):
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)
                            domainfilter = False
                        if f.isfilteredextention(fex.get_resource_path()):
                            extend = False
                            self.r.switchDB(2)
                            self.r.put(new_url, new_url)

                        if extend and domainfilter and schemefilter:
                            self.r.switchDB(1)
                            self.r.rpush('crawl', new_url)
                            self.queue.append(new_url)
        except TypeError as e:
            print "TypeError"
Example #6
class Mail2MISP():
    def __init__(self,
                 misp_url,
                 misp_key,
                 verifycert,
                 config,
                 offline=False,
                 urlsonly=False):
        self.offline = offline
        if not self.offline:
            self.misp = ExpandedPyMISP(misp_url,
                                       misp_key,
                                       verifycert,
                                       debug=config.debug)
        self.config = config
        self.urlsonly = urlsonly
        if not hasattr(self.config, 'enable_dns'):
            setattr(self.config, 'enable_dns', True)
        if self.urlsonly is False:
            setattr(self.config, 'enable_dns', False)
        self.debug = self.config.debug
        self.config_from_email_body = {}
        # Init Faup
        self.f = Faup()
        self.sightings_to_add = []

    def load_email(self, pseudofile):
        self.pseudofile = pseudofile
        self.original_mail = message_from_bytes(self.pseudofile.getvalue(),
                                                policy=policy.default)
        self.subject = self.original_mail.get('Subject')
        try:
            self.sender = self.original_mail.get('From')
        except Exception:
            self.sender = "<unknown sender>"

        # Remove words from subject
        for removeword in self.config.removelist:
            self.subject = re.sub(removeword, "", self.subject).strip()

        # Initialize the MISP event
        self.misp_event = MISPEvent()
        self.misp_event.info = f'{self.config.email_subject_prefix} - {self.subject}'
        self.misp_event.distribution = self.config.default_distribution
        self.misp_event.threat_level_id = self.config.default_threat_level
        self.misp_event.analysis = self.config.default_analysis

    def sighting(self, value, source):
        '''Add a sighting.'''
        if self.offline:
            raise Exception('The script is running in offline mode, sightings cannot be added.')
        s = MISPSighting()
        s.from_dict(value=value, source=source)
        self.misp.add_sighting(s)

    def _find_inline_forward(self):
        '''Does the body contain a forwarded email?'''
        for identifier in self.config.forward_identifiers:
            if identifier in self.clean_email_body:
                self.clean_email_body, fw_email = self.clean_email_body.split(
                    identifier)
                return self.forwarded_email(
                    pseudofile=BytesIO(fw_email.encode()))

    def _find_attached_forward(self):
        forwarded_emails = []
        for attachment in self.original_mail.iter_attachments():
            attachment_content = attachment.get_content()
            # Search for emails forwarded as attachments.
            # There could be more than one, so attach everything.
            if isinstance(attachment_content, message.EmailMessage):
                forwarded_emails.append(
                    self.forwarded_email(
                        pseudofile=BytesIO(attachment_content.as_bytes())))
            else:
                if isinstance(attachment_content, str):
                    attachment_content = attachment_content.encode()
                filename = attachment.get_filename()
                if not filename:
                    filename = 'missing_filename'
                if self.config_from_email_body.get(
                        'attachment'
                ) == self.config.m2m_benign_attachment_keyword:
                    # Attach the file as-is (marked as benign)
                    self.misp_event.add_attribute(
                        'attachment',
                        value=filename,
                        data=BytesIO(attachment_content))
                else:
                    f_object, main_object, sections = make_binary_objects(
                        pseudofile=BytesIO(attachment_content),
                        filename=filename,
                        standalone=False)
                    self.misp_event.add_object(f_object)
                    if main_object:
                        self.misp_event.add_object(main_object)
                        for section in sections:
                            self.misp_event.add_object(section)
        return forwarded_emails

    def email_from_spamtrap(self):
        '''The email comes from a spamtrap and should be attached as-is.'''
        raw_body = self.original_mail.get_body(preferencelist=('html',
                                                               'plain'))
        if raw_body:
            self.clean_email_body = html.unescape(
                raw_body.get_payload(decode=True).decode(
                    'utf8', 'surrogateescape'))
        else:
            self.clean_email_body = ''
        return self.forwarded_email(self.pseudofile)

    def forwarded_email(self, pseudofile: BytesIO):
        '''Extract all possible indicators from an email and create a MISP event from it.
        * Get all relevant headers
        * Attach the body
        * Create MISP file objects (uses lief if possible)
        * Set all references
        '''
        email_object = EMailObject(pseudofile=pseudofile,
                                   attach_original_mail=True,
                                   standalone=False)
        if email_object.attachments:
            # Create file objects for the attachments
            for attachment_name, attachment in email_object.attachments:
                if not attachment_name:
                    attachment_name = 'NameMissing.txt'
                if self.config_from_email_body.get(
                        'attachment'
                ) == self.config.m2m_benign_attachment_keyword:
                    a = self.misp_event.add_attribute('attachment',
                                                      value=attachment_name,
                                                      data=attachment)
                    email_object.add_reference(a.uuid, 'related-to',
                                               'Email attachment')
                else:
                    f_object, main_object, sections = make_binary_objects(
                        pseudofile=attachment,
                        filename=attachment_name,
                        standalone=False)
                    if self.config.vt_key:
                        try:
                            vt_object = VTReportObject(
                                self.config.vt_key,
                                f_object.get_attributes_by_relation(
                                    'sha256')[0].value,
                                standalone=False)
                            self.misp_event.add_object(vt_object)
                            f_object.add_reference(vt_object.uuid,
                                                   'analysed-with')
                        except InvalidMISPObject as e:
                            print(e)
                            pass
                    self.misp_event.add_object(f_object)
                    if main_object:
                        self.misp_event.add_object(main_object)
                        for section in sections:
                            self.misp_event.add_object(section)
                    email_object.add_reference(f_object.uuid, 'related-to',
                                               'Email attachment')
        self.process_body_iocs(email_object)
        if self.config.spamtrap or self.config.attach_original_mail or self.config_from_email_body.get(
                'attach_original_mail'):
            self.misp_event.add_object(email_object)
        return email_object

    def process_email_body(self):
        mail_as_bytes = self.original_mail.get_body(
            preferencelist=('html', 'plain')).get_payload(decode=True)
        if mail_as_bytes:
            self.clean_email_body = html.unescape(
                mail_as_bytes.decode('utf8', 'surrogateescape'))
            # Check if there are config lines in the body & convert them to a python dictionary:
            #   <config.body_config_prefix>:<key>:<value> => {<key>: <value>}
            self.config_from_email_body = {
                k.strip(): v.strip()
                for k, v in re.findall(
                    f'{self.config.body_config_prefix}:(.*):(.*)',
                    self.clean_email_body)
            }
            if self.config_from_email_body:
                # ... remove the config lines from the body
                self.clean_email_body = re.sub(
                    rf'^{self.config.body_config_prefix}.*\n?',
                    '',
                    html.unescape(
                        self.original_mail.get_body(
                            preferencelist=('html', 'plain')).get_payload(
                                decode=True).decode('utf8',
                                                    'surrogateescape')),
                    flags=re.MULTILINE)
            # Check if autopublish key is present and valid
            if self.config_from_email_body.get(
                    'm2mkey') == self.config.m2m_key:
                if self.config_from_email_body.get('distribution') is not None:
                    self.misp_event.distribution = self.config_from_email_body.get(
                        'distribution')
                if self.config_from_email_body.get('threat_level') is not None:
                    self.misp_event.threat_level_id = self.config_from_email_body.get(
                        'threat_level')
                if self.config_from_email_body.get('analysis') is not None:
                    self.misp_event.analysis = self.config_from_email_body.get(
                        'analysis')
                if self.config_from_email_body.get('publish'):
                    self.misp_event.publish()

            self._find_inline_forward()
        else:
            self.clean_email_body = ''
        self._find_attached_forward()

    def process_body_iocs(self, email_object=None):
        if email_object:
            body = html.unescape(
                email_object.email.get_body(
                    preferencelist=('html',
                                    'plain')).get_payload(decode=True).decode(
                                        'utf8', 'surrogateescape'))
        else:
            body = self.clean_email_body

        # Cleanup body content
        # Depending on the source of the mail, there is some cleanup to do: ignore configured lines in the message body.
        for ignoreline in self.config.ignorelist:
            body = re.sub(rf'^{ignoreline}.*\n?', '', body, flags=re.MULTILINE)

        # Remove everything after the stopword from the body
        body = body.split(self.config.stopword, 1)[0]

        # Add tags to the event if keywords are found in the mail
        for tag in self.config.tlptags:
            for alternativetag in self.config.tlptags[tag]:
                if alternativetag in body.lower():
                    self.misp_event.add_tag(tag)

        # Prepare extraction of IOCs
        # Refang email data
        body = refang(body)

        # Extract and add hashes
        contains_hash = False
        for h in set(re.findall(hashmarker.MD5_REGEX, body)):
            contains_hash = True
            attribute = self.misp_event.add_attribute(
                'md5', h, enforceWarninglist=self.config.enforcewarninglist)
            if email_object:
                email_object.add_reference(attribute.uuid, 'contains')
            if self.config.sighting:
                self.sightings_to_add.append((h, self.config.sighting_source))
        for h in set(re.findall(hashmarker.SHA1_REGEX, body)):
            contains_hash = True
            attribute = self.misp_event.add_attribute(
                'sha1', h, enforceWarninglist=self.config.enforcewarninglist)
            if email_object:
                email_object.add_reference(attribute.uuid, 'contains')
            if self.config.sighting:
                self.sightings_to_add.append((h, self.config.sighting_source))
        for h in set(re.findall(hashmarker.SHA256_REGEX, body)):
            contains_hash = True
            attribute = self.misp_event.add_attribute(
                'sha256', h, enforceWarninglist=self.config.enforcewarninglist)
            if email_object:
                email_object.add_reference(attribute.uuid, 'contains')
            if self.config.sighting:
                self.sightings_to_add.append((h, self.config.sighting_source))

        if contains_hash:
            for tag in self.config.hash_only_tags:
                self.misp_event.add_tag(tag)

        # Extract network IOCs
        urllist = []
        urllist += re.findall(urlmarker.WEB_URL_REGEX, body)
        urllist += re.findall(urlmarker.IP_REGEX, body)
        if self.debug:
            syslog.syslog(str(urllist))

        hostname_processed = []

        # Add IOCs and expanded information to MISP
        for entry in set(urllist):
            ids_flag = True
            self.f.decode(entry)

            domainname = self.f.get_domain()
            if domainname in self.config.excludelist:
                # Ignore the entry
                continue

            hostname = self.f.get_host()

            scheme = self.f.get_scheme()
            resource_path = self.f.get_resource_path()

            if self.debug:
                syslog.syslog(domainname)

            if domainname in self.config.internallist and self.urlsonly is False:  # Add link to internal reference unless in urlsonly mode
                attribute = self.misp_event.add_attribute(
                    'link',
                    entry,
                    category='Internal reference',
                    to_ids=False,
                    enforceWarninglist=False)
                if email_object:
                    email_object.add_reference(attribute.uuid, 'contains')
            elif domainname in self.config.externallist or self.urlsonly is False:  # External analysis
                attribute = self.misp_event.add_attribute(
                    'link',
                    entry,
                    category='External analysis',
                    to_ids=False,
                    enforceWarninglist=False)
                if email_object:
                    email_object.add_reference(attribute.uuid, 'contains')
            elif domainname in self.config.externallist or self.urlsonly:  # External analysis
                if self.urlsonly:
                    comment = self.subject + " (from: " + self.sender + ")"
                else:
                    comment = ""
                attribute = self.misp.add_attribute(
                    self.urlsonly, {
                        "type": 'link',
                        "value": entry,
                        "category": 'External analysis',
                        "to_ids": False,
                        "comment": comment
                    })
                for tag in self.config.tlptags:
                    for alternativetag in self.config.tlptags[tag]:
                        if alternativetag in self.subject.lower():
                            self.misp.tag(attribute["uuid"], tag)
                            new_subject = comment.replace(alternativetag, '')
                            self.misp.change_comment(attribute["uuid"],
                                                     new_subject)

            else:  # The URL is probably an indicator.
                comment = ""
                if (domainname in self.config.noidsflaglist) or (
                        hostname in self.config.noidsflaglist):
                    ids_flag = False
                    comment = "Known host (mostly for connectivity test or IP lookup)"
                if self.debug:
                    syslog.syslog(str(entry))

                if scheme:
                    if is_ip(hostname):
                        attribute = self.misp_event.add_attribute(
                            'url',
                            entry,
                            to_ids=False,
                            enforceWarninglist=self.config.enforcewarninglist)
                        if email_object:
                            email_object.add_reference(attribute.uuid,
                                                       'contains')
                    else:
                        if resource_path:  # URL has path, ignore warning list
                            attribute = self.misp_event.add_attribute(
                                'url',
                                entry,
                                to_ids=ids_flag,
                                enforceWarninglist=False,
                                comment=comment)
                            if email_object:
                                email_object.add_reference(
                                    attribute.uuid, 'contains')
                        else:  # URL has no path
                            attribute = self.misp_event.add_attribute(
                                'url',
                                entry,
                                to_ids=ids_flag,
                                enforceWarninglist=self.config.
                                enforcewarninglist,
                                comment=comment)
                            if email_object:
                                email_object.add_reference(
                                    attribute.uuid, 'contains')
                    if self.config.sighting:
                        self.sightings_to_add.append(
                            (entry, self.config.sighting_source))

                if hostname in hostname_processed:
                    # Hostname already processed.
                    continue

                hostname_processed.append(hostname)
                if self.config.sighting:
                    self.sightings_to_add.append(
                        (hostname, self.config.sighting_source))

                if self.debug:
                    syslog.syslog(hostname)

                comment = ''
                port = self.f.get_port()
                if port:
                    comment = f'on port: {port}'

                if is_ip(hostname):
                    attribute = self.misp_event.add_attribute(
                        'ip-dst',
                        hostname,
                        to_ids=ids_flag,
                        enforceWarninglist=self.config.enforcewarninglist,
                        comment=comment)
                    if email_object:
                        email_object.add_reference(attribute.uuid, 'contains')
                else:
                    related_ips = []
                    if HAS_DNS and self.config.enable_dns:
                        try:
                            syslog.syslog(hostname)
                            for rdata in dns.resolver.query(hostname, 'A'):
                                if self.debug:
                                    syslog.syslog(str(rdata))
                                related_ips.append(rdata.to_text())
                        except Exception as e:
                            if self.debug:
                                syslog.syslog(str(e))

                    if related_ips:
                        hip = MISPObject(name='ip-port')
                        hip.add_attribute(
                            'hostname',
                            value=hostname,
                            to_ids=ids_flag,
                            enforceWarninglist=self.config.enforcewarninglist,
                            comment=comment)
                        for ip in set(related_ips):
                            hip.add_attribute('ip',
                                              type='ip-dst',
                                              value=ip,
                                              to_ids=False,
                                              enforceWarninglist=self.config.
                                              enforcewarninglist)
                        self.misp_event.add_object(hip)
                        if email_object:
                            email_object.add_reference(hip.uuid, 'contains')
                    else:
                        if self.urlsonly is False:
                            attribute = self.misp_event.add_attribute(
                                'hostname',
                                value=hostname,
                                to_ids=ids_flag,
                                enforceWarninglist=self.config.
                                enforcewarninglist,
                                comment=comment)
                        if email_object:
                            email_object.add_reference(attribute.uuid,
                                                       'contains')

    def add_event(self):
        '''Add event on the remote MISP instance.'''

        # Add additional tags depending on others
        tags = []
        for tag in [t.name for t in self.misp_event.tags]:
            if self.config.dependingtags.get(tag):
                tags += self.config.dependingtags.get(tag)

        # Add additional tags according to configuration
        for malware in self.config.malwaretags:
            if malware.lower() in self.subject.lower():
                tags += self.config.malwaretags.get(malware)
        if tags:
            for tag in tags:
                self.misp_event.add_tag(tag)

        has_tlp_tag = False
        for tag in [t.name for t in self.misp_event.tags]:
            if tag.lower().startswith('tlp'):
                has_tlp_tag = True
        if not has_tlp_tag:
            self.misp_event.add_tag(self.config.tlptag_default)

        if self.offline:
            return self.misp_event.to_json()
        event = self.misp.add_event(self.misp_event, pythonify=True)
        if self.config.sighting:
            for value, source in self.sightings_to_add:
                self.sighting(value, source)
        return event
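A rough driver sketch for Mail2MISP, under the assumption that a config module provides every attribute the class reads (debug, removelist, email_subject_prefix, tlptags, and so on); run offline, add_event() returns the event as JSON rather than contacting a MISP instance:

# Hypothetical driver for the class above; the config module name and
# 'mail.eml' are illustrative assumptions, not part of the original example.
from io import BytesIO
import mail2misp_config as config  # must expose the attributes used above (debug, removelist, ...)

m2m = Mail2MISP('https://misp.example', '<misp api key>', True, config, offline=True)
with open('mail.eml', 'rb') as fp:
    m2m.load_email(BytesIO(fp.read()))
m2m.process_email_body()   # extracts clean_email_body and any in-body config lines
m2m.process_body_iocs()    # adds hashes, URLs, hostnames/IPs to the MISP event
print(m2m.add_event())     # offline mode returns the event as JSON instead of pushing it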
Example #7
class Query():
    def __init__(self, loglevel: int = logging.DEBUG):
        self.__init_logger(loglevel)
        self.fex = Faup()
        self.cache = Redis(unix_socket_path=get_socket_path('cache'),
                           db=1,
                           decode_responses=True)

    def __init_logger(self, loglevel) -> None:
        self.logger = logging.getLogger(f'{self.__class__.__name__}')
        self.logger.setLevel(loglevel)

    def _cache_set(self, key, value, field=None):
        if field is None:
            self.cache.setex(key, json.dumps(value), 3600)
        else:
            self.cache.hset(key, field, json.dumps(value))
            self.cache.expire(key, 3600)

    def _cache_get(self, key, field=None):
        if field is None:
            value_json = self.cache.get(key)
        else:
            value_json = self.cache.hget(key, field)
        if value_json is not None:
            return json.loads(value_json)
        return None

    def to_bool(self, s):
        """
        Converts the given string to a boolean.
        """
        return s.lower() in ('1', 'true', 'yes', 'on')

    def get_submissions(self, url, day=None):
        if day is None:
            day = date.today().isoformat()
        else:
            day = day.isoformat()
        return self.cache.zscore(f'{day}_submissions', url)

    def get_mail_sent(self, url, day=None):
        if day is None:
            day = date.today().isoformat()
        else:
            day = day.isoformat()
        self.fex.decode(url)
        host = self.fex.get_host()
        return self.cache.sismember(f'{day}_mails', host)

    def set_mail_sent(self, url, day=None):
        if day is None:
            day = date.today().isoformat()
        else:
            day = day.isoformat()
        self.fex.decode(url)
        host = self.fex.get_host()
        return self.cache.sadd(f'{day}_mails', host)

    def is_valid_url(self, url):
        cached = self._cache_get(url, 'valid')
        key = f'{date.today().isoformat()}_submissions'
        self.cache.zincrby(key, 1, url)
        if cached is not None:
            return cached
        if url.startswith('hxxp'):
            url = 'http' + url[4:]
        elif not url.startswith('http'):
            url = 'http://' + url
        logging.debug("Checking validity of URL: " + url)
        self.fex.decode(url)
        scheme = self.fex.get_scheme()
        host = self.fex.get_host()
        if scheme is None or host is None:
            reason = "Not a valid http/https URL/URI"
            return False, url, reason
        self._cache_set(url, (True, url, None), 'valid')
        return True, url, None

    def is_ip(self, host):
        try:
            ipaddress.ip_address(host)
            return True
        except ValueError:
            return False

    def try_resolve(self, url):
        self.fex.decode(url)
        host = self.fex.get_host().lower()
        if self.is_ip(host):
            return True, None
        try:
            ipaddr = dns.resolver.query(host, 'A')
        except Exception:
            reason = "DNS server problem. Check resolver settings."
            return False, reason
        if not ipaddr:
            reason = "Host " + host + " does not exist."
            return False, reason
        return True, None

    def get_urls(self, url, depth=1):
        if depth > 5:
            print('Too many redirects.')
            return

        def meta_redirect(content):
            c = content.lower()
            soup = BeautifulSoup(c, "html.parser")
            for result in soup.find_all(attrs={'http-equiv': 'refresh'}):
                if result:
                    out = result["content"].split(";")
                    if len(out) == 2:
                        wait, text = out
                        try:
                            a, url = text.split('=', 1)
                            return url.strip()
                        except Exception:
                            print(text)
            return None

        resolve, reason = self.try_resolve(url)
        if not resolve:
            # FIXME: inform that the domain does not resolve
            yield url
            return

        logging.debug(f"Making HTTP connection to {url}")

        headers = {
            'User-agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'
        }
        try:
            response = requests.get(url,
                                    allow_redirects=True,
                                    headers=headers,
                                    timeout=15,
                                    verify=False)
        except Exception:
            # That one can fail (DNS for example)
            # FIXME: inform that the get failed
            yield url
            return
        if response.history is not None:
            for h in response.history:
                # Yield the URLs in the order we find them
                yield h.url

        yield response.url

        meta_redir_url = meta_redirect(response.content)
        if meta_redir_url is not None:
            depth += 1
            if not meta_redir_url.startswith('http'):
                self.fex.decode(url)
                base = '{}://{}'.format(self.fex.get_scheme(),
                                        self.fex.get_host())
                port = self.fex.get_port()
                if port is not None:
                    base += f':{port}'
                if not meta_redir_url.startswith('/'):
                    # relative redirect. resource_path has the initial '/'
                    if self.fex.get_resource_path() is not None:
                        base += self.fex.get_resource_path()
                if not base.endswith('/'):
                    base += '/'
                meta_redir_url = base + meta_redir_url
            for url in self.get_urls(meta_redir_url, depth):
                yield url

    def url_list(self, url):
        cached = self._cache_get(url, 'list')
        if cached is not None:
            return cached
        list_urls = []
        for u in self.get_urls(url):
            if u is None or u in list_urls:
                continue
            list_urls.append(u)
        self._cache_set(url, list_urls, 'list')
        return list_urls

    def dns_resolve(self, url):
        cached = self._cache_get(url, 'dns')
        if cached is not None:
            return cached
        self.fex.decode(url)
        host = self.fex.get_host().lower()
        ipv4 = None
        ipv6 = None
        if self.is_ip(host):
            if ':' in host:
                try:
                    socket.inet_pton(socket.AF_INET6, host)
                    ipv6 = [host]
                except Exception:
                    pass
            else:
                try:
                    socket.inet_aton(host)
                    ipv4 = [host]
                except Exception:
                    pass
        else:
            try:
                ipv4 = [str(ip) for ip in dns.resolver.query(host, 'A')]
            except Exception:
                logging.debug("No IPv4 address assigned to: " + host)
            try:
                ipv6 = [str(ip) for ip in dns.resolver.query(host, 'AAAA')]
            except Exception:
                logging.debug("No IPv6 address assigned to: " + host)
        self._cache_set(url, (ipv4, ipv6), 'dns')
        return ipv4, ipv6

    def phish_query(self, url, key, query):
        cached = self._cache_get(query, 'phishtank')
        if cached is not None:
            return cached
        postfields = {'url': quote(query), 'format': 'json', 'app_key': key}
        response = requests.post(url, data=postfields)
        res = response.json()
        if res["meta"]["status"] == "success":
            if res["results"]["in_database"]:
                self._cache_set(query, res["results"]["phish_detail_page"],
                                'phishtank')
                return res["results"]["phish_detail_page"]
            else:
                # no information
                pass
        elif res["meta"]["status"] == 'error':
            # Inform the user?
            # errormsg = res["errortext"]
            pass
        return None

    def sphinxsearch(self, server, port, url, query):
        # WARNING: too dangerous to have on the public interface
        return ''
        """
        if not sphinx:
            return None
        cached = _cache_get(query, 'sphinx')
        if cached is not None:
            return cached
        client = sphinxapi.SphinxClient()
        client.SetServer(server, port)
        client.SetMatchMode(2)
        client.SetConnectTimeout(5.0)
        result = []
        res = client.Query(query)
        if res.get("matches") is not None:
            for ticket in res["matches"]:
                ticket_id = ticket["id"]
                ticket_link = url + str(ticket_id)
                result.append(ticket_link)
        _cache_set(query, result, 'sphinx')
        return result

        """

    def vt_query_url(self, url, url_up, key, query, upload=True):
        cached = self._cache_get(query, 'vt')
        if cached is not None and cached[2] is not None:
            return cached
        parameters = {"resource": query, "apikey": key}
        if upload:
            parameters['scan'] = 1
        response = requests.post(url, data=parameters)
        if response.text is None or len(response.text) == 0:
            return None
        res = response.json()
        msg = res["verbose_msg"]
        link = res.get("permalink")
        positives = res.get("positives")
        total = res.get("total")
        self._cache_set(query, (msg, link, positives, total), 'vt')
        return msg, link, positives, total

    def gsb_query(self, url, query):
        cached = self._cache_get(query, 'gsb')
        if cached is not None:
            return cached
        param = '1\n' + query
        response = requests.post(url, data=param)
        status = response.status_code
        if status == 200:
            self._cache_set(query, response.text, 'gsb')
            return response.text

    '''
    def urlquery_query(url, key, query):
        return None
        cached = _cache_get(query, 'urlquery')
        if cached is not None:
            return cached
        try:
            urlquery.url = url
            urlquery.key = key
            response = urlquery.search(query)
        except Exception:
            return None
        if response['_response_']['status'] == 'ok':
            if response.get('reports') is not None:
                total_alert_count = 0
                for r in response['reports']:
                    total_alert_count += r['urlquery_alert_count']
                    total_alert_count += r['ids_alert_count']
                    total_alert_count += r['blacklist_alert_count']
                    _cache_set(query, total_alert_count, 'urlquery')
                    return total_alert_count
            else:
                return None
    '''

    def process_emails(self, emails, ignorelist, replacelist):
        to_return = list(set(emails))
        for mail in reversed(to_return):
            for ignorelist_entry in ignorelist:
                if re.search(ignorelist_entry, mail, re.I):
                    if mail in to_return:
                        to_return.remove(mail)
            for k, v in list(replacelist.items()):
                if re.search(k, mail, re.I):
                    if k in to_return:
                        to_return.remove(k)
                        to_return += v
        return to_return

    def whois(self, server, port, domain, ignorelist, replacelist):
        cached = self._cache_get(domain, 'whois')
        if cached is not None:
            return cached
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.settimeout(15)
        try:
            s.connect((server, port))
        except Exception:
            print("Connection problems - check WHOIS server")
            print(("WHOIS request while problem occurred: ", domain))
            print(("WHOIS server: {}:{}".format(server, port)))
            return None
        if domain.startswith('http'):
            self.fex.decode(domain)
            d = self.fex.get_domain().lower()
        else:
            d = domain
        s.send(("{}\r\n".format(d)).encode())
        response = b''
        while True:
            d = s.recv(4096)
            response += d
            if d == b'':
                break
        s.close()
        match = re.findall(r'[\w\.-]+@[\w\.-]+', response.decode())
        emails = self.process_emails(match, ignorelist, replacelist)
        if len(emails) == 0:
            return None
        list_mail = list(set(emails))
        self._cache_set(domain, list_mail, 'whois')
        return list_mail

    def pdnscircl(self, url, user, passwd, q):
        cached = self._cache_get(q, 'pdns')
        if cached is not None:
            return cached
        pdns = PyPDNS(url, basic_auth=(user, passwd))
        response = pdns.query(q)
        all_uniq = []
        for e in reversed(response):
            host = e['rrname'].lower()
            if host in all_uniq:
                continue
            else:
                all_uniq.append(host)
        response = (len(all_uniq), all_uniq[:5])
        self._cache_set(q, response, 'pdns')
        return response

    def psslcircl(self, url, user, passwd, q):
        cached = self._cache_get(q, 'pssl')
        if cached is not None:
            return cached
        pssl = PyPSSL(url, basic_auth=(user, passwd))
        response = pssl.query(q)
        if response.get(q) is not None:
            certinfo = response.get(q)
            entries = {}
            for sha1 in certinfo['certificates']:
                entries[sha1] = []
                if certinfo['subjects'].get(sha1):
                    for value in certinfo['subjects'][sha1]['values']:
                        entries[sha1].append(value)
            self._cache_set(q, entries, 'pssl')
            return entries
        return None

    def eupi(self, url, key, q):
        cached = self._cache_get(q, 'eupi')
        if cached is not None:
            return cached
        eu = PyEUPI(key, url)
        response = eu.search_url(url=q)
        if response.get('results'):
            r = response.get('results')[0]['tag_label']
            self._cache_set(q, r, 'eupi')
            return r
        eu.post_submission(q)
        return None

    def bgpranking(self, ip):
        cached = self._cache_get(ip, 'ipasn')
        if cached is not None:
            asn = cached['asn']
            prefix = cached['prefix']
        else:
            ipasn = IPASNHistory()
            response = ipasn.query(ip)
            if 'response' not in response:
                asn = None
                prefix = None
            else:
                entry = response['response'][list(response['response'].keys())[0]]
                if entry:
                    self._cache_set(ip, entry, 'ipasn')
                    asn = entry['asn']
                    prefix = entry['prefix']
                else:
                    asn = None
                    prefix = None

        if not asn or not prefix:
            # asn, prefix, asn_descr, rank, position, known_asns
            return None, None, None, None, None, None

        cached = self._cache_get(ip, 'bgpranking')
        if cached is not None:
            return cached
        bgpranking = BGPRanking()
        response = bgpranking.query(asn,
                                    date=(date.today() -
                                          timedelta(1)).isoformat())
        if 'response' not in response or not response['response']:
            return None, None, None, None, None, None
        to_return = (asn, prefix, response['response']['asn_description'],
                     response['response']['ranking']['rank'],
                     response['response']['ranking']['position'],
                     response['response']['ranking']['total_known_asns'])
        self._cache_set(ip, to_return, 'bgpranking')
        return to_return

    def lookyloo(self, url):
        cached = self._cache_get(url, 'lookyloo')
        if cached is not None:
            return cached
        lookyloo = Lookyloo()
        lookyloo_perma_url = lookyloo.enqueue(url)
        if lookyloo_perma_url:
            self._cache_set(url, lookyloo_perma_url, 'lookyloo')
            return lookyloo_perma_url
        return None

    def _deserialize_cached(self, entry):
        to_return = {}
        redirects = []
        h = self.cache.hgetall(entry)
        for key, value in h.items():
            v = json.loads(value)
            if key == 'list':
                redirects = v
                continue
            to_return[key] = v
        return to_return, redirects

    def get_url_data(self, url):
        data, redirects = self._deserialize_cached(url)
        if data.get('dns') is not None:
            ipv4, ipv6 = data['dns']
            ip_data = {}
            if ipv4 is not None:
                for ip in ipv4:
                    info, _ = self._deserialize_cached(ip)
                    ip_data[ip] = info
            if ipv6 is not None:
                for ip in ipv6:
                    info, _ = self._deserialize_cached(ip)
                    ip_data[ip] = info
            if len(ip_data) > 0:
                data.update(ip_data)
        return {url: data}, redirects

    def cached(self, url, digest=False):
        url_data, redirects = self.get_url_data(url)
        to_return = [url_data]
        for u in redirects:
            if u == url:
                continue
            data, redir = self.get_url_data(u)
            to_return.append(data)
        if digest:
            return {'result': to_return, 'digest': self.digest(to_return)}
        return {'result': to_return}

    def ip_details_digest(self, ips, all_info, all_asns, all_mails):
        to_return = ''
        for ip in ips:
            to_return += '\t' + ip + '\n'
            data = all_info[ip]
            if data.get('bgpranking'):
                to_return += '\t\tis announced by {} ({}). Position {}/{}.\n'.format(
                    data['bgpranking'][2], data['bgpranking'][0],
                    data['bgpranking'][4], data['bgpranking'][5])
                all_asns.add('{} ({})'.format(data['bgpranking'][2],
                                              data['bgpranking'][0]))
            if data.get('whois'):
                all_mails.update(data.get('whois'))
        return to_return

    def digest(self, data):
        to_return = ''
        all_mails = set()
        all_asns = set()
        for entry in data:
            # Each URL we're redirected to
            for url, info in entry.items():
                # info contains the information we got for the URL.
                to_return += '\n{}\n'.format(url)
                if 'whois' in info:
                    all_mails.update(info['whois'])
                if 'lookyloo' in info:
                    to_return += '\tLookyloo permanent URL: {}\n'.format(
                        info['lookyloo'])
                if 'vt' in info and len(info['vt']) == 4:
                    if info['vt'][2] is not None:
                        to_return += '\t{} out of {} positive detections in VT - {}\n'.format(
                            info['vt'][2], info['vt'][3], info['vt'][1])
                    else:
                        to_return += '\t{} - {}\n'.format(
                            info['vt'][0], info['vt'][1])
                if 'gsb' in info:
                    to_return += '\tKnown as malicious on Google Safe Browsing: {}\n'.format(
                        info['gsb'])
                if 'phishtank' in info:
                    to_return += '\tKnown on PhishTank: {}\n'.format(
                        info['phishtank'])

                if 'dns' in info:
                    ipv4, ipv6 = info['dns']
                    if ipv4 is not None:
                        to_return += self.ip_details_digest(
                            ipv4, info, all_asns, all_mails)
                    if ipv6 is not None:
                        to_return += self.ip_details_digest(
                            ipv6, info, all_asns, all_mails)
        return to_return, list(all_mails), list(all_asns)
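To close, a hypothetical usage sketch for the Query class; it assumes the module's imports are in place and that a Redis instance is reachable through get_socket_path('cache') as in __init__:

q = Query()
valid, url, reason = q.is_valid_url('example.com')   # normalised to http://example.com
if valid:
    print(q.url_list(url))     # redirect chain discovered by get_urls(), cached for an hour
    print(q.dns_resolve(url))  # ([IPv4 addresses], [IPv6 addresses])
else:
    print(reason)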