Пример #1
0
    def parse_memory_urls(self):
        """ Collect the valid URLs from the report's memory streams.

        Reads the 'analysis' -> 'hybridanalysis' -> 'ipdomainstreams' ->
        'stream' section of self.report via self.parse(). Each entry may be
        a plain string or a dict carrying the URL under the 'db' key.

        Returns:
            A sorted list of the unique values that pass is_valid().
        """
        self.logger.debug('Parsing memory URLs')

        streams = self.parse(self.report, 'analysis', 'hybridanalysis',
                             'ipdomainstreams', 'stream')
        if not streams:
            return []

        # A lone stream arrives as a single dict instead of a list of them.
        if isinstance(streams, dict):
            streams = [streams]

        found = set()
        for entry in streams:
            if isinstance(entry, str):
                candidate = entry
            elif isinstance(entry, dict) and 'db' in entry:
                candidate = entry['db']
            else:
                # Neither a string nor a dict with a 'db' value.
                continue

            if is_valid(candidate):
                found.add(candidate)

        return sorted(found)
Пример #2
0
    def is_email_address_whitelisted(self,
                                     address,
                                     value_in_indicator=True,
                                     indicator_in_value=False):
        """ Returns True if the email address is invalid or is whitelisted.

        Addresses that do not look like an e-mail address, whose domain does
        not pass is_valid(), or that cannot be inspected at all (e.g. a
        non-string value) are added to the whitelisted cache and reported as
        whitelisted.

        Args:
            address: The e-mail address to check.
            value_in_indicator: Passed through to self._is_whitelisted().
            indicator_in_value: Passed through to self._is_whitelisted().

        Returns:
            True if the address is cached as whitelisted, is invalid, or
            matches the whitelist; False if it is cached as non-whitelisted
            or does not match the whitelist.
        """

        # First check if the address was already cached.
        if self._is_cached_whitelisted(address):
            return True
        if self._is_cached_nonwhitelisted(address):
            return False

        # Check if the address is valid. Note that match() only anchors at
        # the start of the string, so trailing characters are tolerated.
        email_pattern = re.compile(
            r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,63}')
        try:
            if not email_pattern.match(address):
                self._add_whitelisted_cache(address)
                self.logger.debug('Invalid e-mail address: {}'.format(address))
                return True
        except Exception:
            # Best effort: anything we cannot even inspect (such as a
            # non-string address) is treated like an invalid address.
            # KeyboardInterrupt/SystemExit are deliberately not swallowed.
            self._add_whitelisted_cache(address)
            return True

        # Check if the domain is valid.
        try:
            domain = address.split('@')[1]
            if not is_valid(domain):
                self._add_whitelisted_cache(address)
                self.logger.debug(
                    'Invalid e-mail address domain: {}'.format(address))
                return True
        except Exception:
            # Same best-effort policy as above.
            self._add_whitelisted_cache(address)
            return True

        return self._is_whitelisted(address, [
            'Email - Address', 'WHOIS Registrant Email Address',
            'Email Address From', 'Email Address Sender'
        ],
                                    value_in_indicator=value_in_indicator,
                                    indicator_in_value=indicator_in_value)
Пример #3
0
    def is_domain_whitelisted(self,
                              domain,
                              value_in_indicator=False,
                              indicator_in_value=True):
        """ Returns True if the domain fails is_valid() or is whitelisted.

        Cached answers are returned immediately. A domain that fails
        is_valid() is added to the whitelisted cache and reported as
        whitelisted; otherwise the 'URI - Domain Name' whitelist decides.
        """

        # Cached verdicts win over everything else.
        if self._is_cached_whitelisted(domain):
            return True
        if self._is_cached_nonwhitelisted(domain):
            return False

        # A valid domain is handed to the real whitelist lookup.
        if is_valid(domain):
            return self._is_whitelisted(
                domain, ['URI - Domain Name'],
                value_in_indicator=value_in_indicator,
                indicator_in_value=indicator_in_value)

        # Anything else is remembered and reported as whitelisted.
        self._add_whitelisted_cache(domain)
        self.logger.debug('Invalid domain: {}'.format(domain))
        return True
Пример #4
0
def _expand_method_argument_splits(process_trees, method_name, logger):
    """ Build extra process trees by splitting on an obfuscated method call's argument.

    For every tree that mentions method_name (case-insensitively), the first
    ".<method_name>(<argument>)" call is located and its argument is used as
    a separator: the tree is split on it, re-joined with spaces, and then
    stripped of string-concatenation and quoting noise.

    Args:
        process_trees: Iterable of process tree strings to inspect.
        method_name: Lowercase method name to look for ('split' or 'invoke').
        logger: Logger used to report unexpected failures.

    Returns:
        A list of the newly built process tree strings (possibly empty).
    """
    argument_pattern = re.compile(
        r'\.[\'\"]*' + method_name + r'[\'\"]*\([\'\"\s]*(.*?)[\'\"\s]*\)',
        re.IGNORECASE)

    # Applied in this exact order after the tree has been split and re-joined.
    cleanups = (("'+'", ''), ('"+"', ''), ("'", ' '), ('"', ' '), ('. ', ' '))

    expanded_trees = []
    for tree in process_trees:
        if method_name not in tree.lower():
            continue
        try:
            match = argument_pattern.search(tree)
            split_char = match.group(1) if match else None
            # An empty argument cannot be used as a separator.
            if split_char:
                expanded = ' '.join(tree.split(split_char))
                for old, new in cleanups:
                    expanded = expanded.replace(old, new)
                expanded_trees.append(expanded)
        except Exception:
            logger.exception(
                'Could not find process tree {}() character.'.format(
                    method_name))
    return expanded_trees


def dedup_reports(report_list, whitelist):
    """ Merge a list of BaseSandboxParser subclass objects to make a single generic report.

    Args:
        report_list: Iterable of parsed sandbox reports to merge.
        whitelist: Object providing is_dropped_file_whitelisted() and
            is_dns_request_whitelisted(), used to set indicator statuses.

    Returns:
        A new BaseSandboxParser holding the merged values and the indicators
        built from them.
    """

    logger = logging.getLogger()
    logger.debug('Deduping sandbox report list')

    # Matches Powershell format-string obfuscation such as
    # ("{0}{1}"-f'JDxA','QDc'). Compiled once since it never changes.
    formatter_pattern = re.compile(
        r'(\([\'\"](({(\d+)})+)[\'\"]\s*\-f\s*(([\'\"][^\'\"]+[\'\"],*)+)\))',
        re.IGNORECASE)

    # Create the new generic report.
    dedup_report = BaseSandboxParser()

    for report in report_list:
        dedup_report.sandbox_urls += report.sandbox_urls

        if report.filename and report.filename != 'sample':
            dedup_report.filename = report.filename

        if report.md5:
            dedup_report.md5 = report.md5
            dedup_report.indicators.append(
                Indicator('Hash - MD5',
                          dedup_report.md5,
                          tags=['sandboxed_sample']))

        if report.sha1:
            dedup_report.sha1 = report.sha1
            dedup_report.indicators.append(
                Indicator('Hash - SHA1',
                          dedup_report.sha1,
                          tags=['sandboxed_sample']))

        if report.sha256:
            dedup_report.sha256 = report.sha256
            dedup_report.indicators.append(
                Indicator('Hash - SHA256',
                          dedup_report.sha256,
                          tags=['sandboxed_sample']))

        # No indicator is created for the SHA512 hash.
        if report.sha512:
            dedup_report.sha512 = report.sha512

        if report.ssdeep:
            dedup_report.ssdeep = report.ssdeep
            dedup_report.indicators.append(
                Indicator('Hash - SSDEEP',
                          dedup_report.ssdeep,
                          tags=['sandboxed_sample']))

        dedup_report.malware_family += report.malware_family

        # Dedup the contacted hosts.
        for host in report.contacted_hosts:
            if host in dedup_report.contacted_hosts:
                continue
            dedup_report.contacted_hosts.append(host)

            tags = ['contacted_host']
            if host['protocol'] and host['port']:
                tags.append('{} {}'.format(host['protocol'], host['port']))
            elif host['protocol']:
                tags.append(host['protocol'])

            # For now we consider ALL contacted hosts to be benign, so no need to check the whitelist.
            dedup_report.indicators.append(
                Indicator('Address - ipv4-addr',
                          host['ipv4'],
                          status='Informational',
                          tags=tags))

        # Dedup the dropped files.
        for file in report.dropped_files:

            # Dropped files are harder than the other items to properly whitelist, so we will
            # initially restrict them to certain file names or file types that we care about.
            interesting = any(
                name in file['filename']
                for name in dedup_report.good_dropped_file_names) or any(
                    t in file['type']
                    for t in dedup_report.good_dropped_file_types)
            if not interesting or file in dedup_report.dropped_files:
                continue
            dedup_report.dropped_files.append(file)

            # If any part of the dropped file is whitelisted, make sure we mark all parts as whitelisted.
            if whitelist.is_dropped_file_whitelisted(file):
                status = 'Whitelisted'
                file['status'] = 'Whitelisted'
            else:
                status = 'New'

            dedup_report.indicators.append(
                Indicator('Windows - FileName',
                          file['filename'],
                          status=status,
                          tags=['dropped_file']))
            dedup_report.indicators.append(
                Indicator('Hash - MD5',
                          file['md5'],
                          status=status,
                          tags=['dropped_file'],
                          relationships=[file['sha1'], file['sha256']]))
            dedup_report.indicators.append(
                Indicator('Hash - SHA1',
                          file['sha1'],
                          status=status,
                          tags=['dropped_file'],
                          relationships=[file['md5'], file['sha256']]))
            dedup_report.indicators.append(
                Indicator('Hash - SHA256',
                          file['sha256'],
                          status=status,
                          tags=['dropped_file'],
                          relationships=[file['md5'], file['sha1']]))

        # Dedup the HTTP requests.
        for request in report.http_requests:
            if request in dedup_report.http_requests:
                continue
            dedup_report.http_requests.append(request)
            dedup_report.indicators += make_url_indicators(
                [request['url']], tags=['http_request', request['method']])

        # Dedup the DNS requests.
        for request in report.dns_requests:
            if request in dedup_report.dns_requests:
                continue
            dedup_report.dns_requests.append(request)

            # If any part of the DNS request is whitelisted, make sure we mark all parts as whitelisted.
            if whitelist.is_dns_request_whitelisted(request):
                status = 'Whitelisted'
            else:
                status = 'New'

            # The computed status is applied to the requested domain. It was
            # previously computed but never handed to the indicator.
            dedup_report.indicators.append(
                Indicator('URI - Domain Name',
                          request['request'],
                          status=status,
                          tags=['dns_request']))

            # For now we consider ALL request IP addresses to be benign, so no need to check the whitelist.
            try:
                ipaddress.ip_address(request['answer'])
            except (KeyError, ValueError):
                # No answer, or the answer is not an IP address (e.g. a CNAME).
                pass
            else:
                dedup_report.indicators.append(
                    Indicator('Address - ipv4-addr',
                              request['answer'],
                              tags=['dns_response'],
                              status='Informational',
                              relationships=[request['request']]))

        # Dedup the memory strings.
        dedup_report.memory_strings += report.memory_strings
        dedup_report.memory_strings = sorted(set(dedup_report.memory_strings))

        # Dedup the memory URLs.
        # NOTE(review): indicators are rebuilt from the full accumulated URL
        # list for every report, so URLs from earlier reports get indicators
        # again. Confirm whether indicators are de-duplicated downstream.
        dedup_report.memory_urls += report.memory_urls
        dedup_report.memory_urls = [
            u for u in set(dedup_report.memory_urls) if is_valid(u)
        ]
        dedup_report.indicators += make_url_indicators(
            dedup_report.memory_urls, tags=['url_in_memory'])

        # Dedup the strings URLs.
        dedup_report.strings_urls += report.strings_urls
        dedup_report.strings_urls = [
            u for u in set(dedup_report.strings_urls) if is_valid(u)
        ]
        dedup_report.indicators += make_url_indicators(
            dedup_report.strings_urls, tags=['url_in_strings'])

        # Dedup the mutexes.
        dedup_report.mutexes += report.mutexes
        dedup_report.mutexes = list(set(dedup_report.mutexes))

        # Dedup the resolved APIs.
        dedup_report.resolved_apis += report.resolved_apis
        dedup_report.resolved_apis = list(set(dedup_report.resolved_apis))

        # Dedup the created services.
        dedup_report.created_services += report.created_services
        dedup_report.created_services = list(
            set(dedup_report.created_services))

        # Dedup the started services.
        dedup_report.started_services += report.started_services
        dedup_report.started_services = list(
            set(dedup_report.started_services))

        # Add the process tree as-is.
        dedup_report.process_trees.append(report.process_tree)

        # Try to decode base64 chunks in the process tree.
        process_tree_decoded = report.process_tree
        for chunk in report.process_tree.split():
            try:
                raw_chunk = base64.b64decode(chunk)
                decoded_chunk = raw_chunk.decode('utf-8')
                # Embedded NULs suggest the payload was really UTF-16.
                if '\x00' in decoded_chunk:
                    decoded_chunk = raw_chunk.decode('utf-16')
            except ValueError:
                # Not base64 (binascii.Error) or not decodable text
                # (UnicodeDecodeError); both are ValueError subclasses.
                continue
            process_tree_decoded = process_tree_decoded.replace(
                chunk, decoded_chunk)
        dedup_report.process_trees_decoded.append(process_tree_decoded)

        # Remove ` backtick and other basic Powershell obfuscation.
        dedup_report.process_trees_decoded += [
            tree.replace('`', '')
            for tree in dedup_report.process_trees_decoded
            if 'powershell' in tree.lower()
        ]

        # Remove Powershell string formatter obfuscation.
        new_trees = []
        for decoded_process_tree in dedup_report.process_trees_decoded:
            results = formatter_pattern.findall(decoded_process_tree)
            if not results:
                continue

            for result in results:
                # Example result tuple:
                # ('("{0}{1}"-f\'JDxA\',\'QDc\')', '{0}{1}', '{1}', '1', "'JDxA','QDc'", "'QDc'")
                full_match = result[0]
                order = result[1][1:-1]  # 0}{1
                items = result[4]  # "'JDxA','QDc'"

                order_ints = [int(x) for x in order.split('}{')]
                items_list = [
                    i.replace('\'', '').replace('"', '')
                    for i in items.split(',')
                ]

                # Only substitute when every placeholder can be resolved. An
                # out-of-range index (e.g. "{0}{5}" with two items) would
                # otherwise raise IndexError and abort the whole merge.
                if (len(order_ints) == len(items_list)
                        and max(order_ints) < len(items_list)):
                    deobfuscated_string = ''.join(
                        items_list[i] for i in order_ints)
                    decoded_process_tree = decoded_process_tree.replace(
                        full_match, deobfuscated_string)
            new_trees.append(decoded_process_tree)
        dedup_report.process_trees_decoded += new_trees

        # Try to decode string .split() obfuscation (used by Emotet and others)
        dedup_report.process_trees_decoded += _expand_method_argument_splits(
            dedup_report.process_trees_decoded, 'split', logger)

        # Try to decode string .invoke() obfuscation (used by Emotet and others)
        dedup_report.process_trees_decoded += _expand_method_argument_splits(
            dedup_report.process_trees_decoded, 'invoke', logger)

        # Dedup the process tree URLs. Start by just adding the URLs from each report.
        dedup_report.process_tree_urls += report.process_tree_urls
        # Find the URLs in each decoded process tree.
        for decoded_tree in dedup_report.process_trees_decoded:
            urls = find_urls(decoded_tree)
            # Remove any URL that has these URLs as substrings, since it's probably a bogus
            # URL from the original, non-decoded process tree.
            for u in report.process_tree_urls:
                if any(decoded_url in u for decoded_url in urls):
                    try:
                        dedup_report.process_tree_urls.remove(u)
                    except ValueError:
                        # Already removed while handling an earlier tree.
                        pass
                    else:
                        logger.debug(
                            'Removing bad process tree URL: {}'.format(u))
            dedup_report.process_tree_urls += urls
        dedup_report.process_tree_urls = [
            u for u in set(dedup_report.process_tree_urls) if is_valid(u)
        ]
        dedup_report.indicators += make_url_indicators(
            dedup_report.process_tree_urls, tags=['url_in_process_tree'])

        # Add the screenshot URLs as-is.
        if report.screenshot_path:
            dedup_report.screenshot_paths.append(report.screenshot_path)

    return dedup_report
Пример #5
0
    def is_url_whitelisted(self,
                           u,
                           value_in_indicator=False,
                           indicator_in_value=False):
        """ Returns True if the URL is invalid or is whitelisted.

        The URL is considered whitelisted when it fails is_valid(), or when
        its host (IP or domain), its path, its query, or the full URL matches
        the whitelist. Positive results found here are added to the
        whitelisted cache.

        Args:
            u: The URL to check.
            value_in_indicator: Passed through to self._is_whitelisted().
            indicator_in_value: Passed through to self._is_whitelisted().

        Returns:
            True if the URL is invalid or whitelisted, otherwise False.
        """

        # First check if the URL was already cached.
        if self._is_cached_whitelisted(u):
            return True
        if self._is_cached_nonwhitelisted(u):
            return False

        # Check if the URL is valid.
        if not is_valid(u):
            self._add_whitelisted_cache(u)
            self.logger.debug('Invalid URL: {}'.format(u))
            return True

        # Split the URL and check each part against the whitelist.
        split_url = urlsplit(u)

        # Reduce the netloc to just the host. Drop any "user@" or
        # "user:password@" prefix first, then any ":port" suffix, so that
        # every combination (including "user@host:port") is handled without
        # raising and without leaving credentials or the port in the host.
        netloc = split_url.netloc
        if '@' in netloc:
            netloc = netloc.rpartition('@')[2]
        if ':' in netloc:
            netloc = netloc.split(':')[0]

        # Check the netloc. Check if it is an IP address. Only the parsing is
        # guarded, so errors raised by the whitelist checks are not mistaken
        # for "this is a domain name".
        try:
            ipaddress.ip_address(netloc)
            netloc_is_ip = True
        except ValueError:
            netloc_is_ip = False

        if netloc_is_ip:
            if self.is_ip_whitelisted(netloc):
                self._add_whitelisted_cache(u)
                self.logger.debug(
                    'URL whitelisted because of IP: {}'.format(u))
                return True
        elif self.is_domain_whitelisted(netloc):
            self._add_whitelisted_cache(u)
            self.logger.debug(
                'URL whitelisted because of domain: {}'.format(u))
            return True

        # Check the URI path if it exists.
        if split_url.path and split_url.path != '/':
            if self.is_uri_path_whitelisted(split_url.path):
                self._add_whitelisted_cache(u)
                self.logger.debug(
                    'URL whitelisted because of path: {}'.format(u))
                return True

        # Check the URI query if it exists. The query string is checked with
        # the same helper that is used for paths.
        if split_url.query:
            if self.is_uri_path_whitelisted(split_url.query):
                self._add_whitelisted_cache(u)
                self.logger.debug(
                    'URL whitelisted because of query: {}'.format(u))
                return True

        # Finally check the entire URL.
        return self._is_whitelisted(u, ['URI - URL'],
                                    value_in_indicator=value_in_indicator,
                                    indicator_in_value=indicator_in_value)
Пример #6
0
    def __init__(self, smtp_path, whitelist):
        """ Parse the e-mail stored at smtp_path and build indicators from it.

        Args:
            smtp_path: Path to the e-mail file to parse. It may still contain
                the SMTP commands above the e-mail itself. The directory that
                holds this file is also scanned for screenshot images.
            whitelist: Whitelist object; stored on the instance as-is.

        Raises:
            OSError: If the e-mail file or its directory cannot be read.
        """

        # Initiate logging.
        self.logger = logging.getLogger()

        # Save the whitelist.
        self.whitelist = whitelist

        # Items we parse out of the email.
        self.ace_url = ''
        self.attachments = []
        self.body = ''
        self.cc_addresses = []
        self.envelope_from = ''
        self.envelope_to = ''
        self.from_address = ''
        self.headers = ''
        self.html = ''
        self.indicators = []
        self.message_id = ''
        self.original_recipient = ''
        self.path = smtp_path
        self.received = ''
        self.received_time = ''
        self.remediated = False
        self.reply_to = ''
        self.return_path = ''
        self.screenshots = []
        self.subject = ''
        self.subject_decoded = ''
        self.to_addresses = []
        self.urls = []
        self.x_auth_id = ''
        self.x_mailer = ''
        self.x_original_sender = ''
        self.x_originating_ip = ''
        self.x_sender = ''
        self.x_sender_id = ''
        self.x_sender_ip = ''

        # Build the URL to the ACE alert.
        ace_uuid_pattern = re.compile(r'([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{4}-[a-f0-9]{12})')
        match = ace_uuid_pattern.search(self.path)
        if match:
            self.ace_url = '{}{}'.format(config['ace']['ace_alert_url'], match.group(1))

        with open(self.path, encoding='utf-8', errors='ignore') as s:
            smtp_stream = s.read().splitlines()

        # Locate any screenshots for this email.
        email_dir = os.path.dirname(self.path)
        for f in os.listdir(email_dir):
            if 'text_html' in f and f.endswith('.png') and not f.startswith('email_screenshot'):
                screenshot_path = os.path.join(email_dir, f)
                self.logger.debug('Found email screenshot: {}'.format(screenshot_path))
                self.screenshots.append(screenshot_path)

        # Find the envelope from/to addresses. This will only work if given an
        # "smtp.stream" file, since otherwise the SMTP commands will not exist.
        envelope_address_pattern = re.compile(r'.*<(.*)>.*')
        for line in smtp_stream:
            if line.startswith('MAIL FROM:'):
                try:
                    self.envelope_from = envelope_address_pattern.match(line).group(1)
                except AttributeError:
                    # match() returned None: no <address> on this line.
                    self.logger.exception('Unable to parse envelope from.')
            if line.startswith('RCPT TO:'):
                try:
                    self.envelope_to = envelope_address_pattern.match(line).group(1)
                except AttributeError:
                    self.logger.exception('Unable to parse envelope to.')

        # Just in case we are dealing with an "smtp.stream" file that still has
        # the SMTP commands above the actual e-mail, we need to strip those out.
        # This will remove all lines prior to the Received: headers so that the
        # email.parser can properly parse out the e-mail. If we were given an
        # "smtp.email" type of file with the SMTP commands already removed, this
        # should not affect anything. This is legacy code at this point.
        first_received = next(
            (idx for idx, line in enumerate(smtp_stream) if line.startswith('Received:')),
            None)
        # Without any Received: header there is nothing left to parse.
        smtp_stream = [] if first_received is None else smtp_stream[first_received:]

        # Join the header lines into a single string.
        self.email_text = '\n'.join(smtp_stream)

        # Create the e-mail object.
        email_obj = email.message_from_string(self.email_text)

        # We want to try and parse an embedded/attached e-mail if there is one.
        # Walk the full e-mail's parts.
        for part in email_obj.walk():
            # Continue if the part looks like a valid e-mail.
            if part.get_content_type() == 'message/rfc822':
                # Split the part lines into a list.
                part_lines = str(part).splitlines()

                # Make sure our part starts with the Received: headers. Only a
                # line that actually STARTS with "Received:" counts; a line
                # that merely contains it (such as "X-Received:") used to
                # empty the list and raise IndexError.
                part_start = next(
                    (idx for idx, line in enumerate(part_lines) if line.startswith('Received:')),
                    None)
                if part_start is not None:
                    # Make the new e-mail object.
                    email_obj = email.message_from_string('\n'.join(part_lines[part_start:]))

        # Parse the e-mail object for its content.
        parsed_email = self._parse_content(email_obj)

        # Now that we have the e-mail object, parse out some of the interesting parts.
        self.headers = self._get_all_headers_string(email_obj)
        self.received = self.get_header(email_obj, 'received')

        # Get the e-mail's plaintext body and HTML body.
        self.body = parsed_email['body']
        self.html = parsed_email['html']

        # Get any e-mail attachments.
        self.attachments = parsed_email['attachments']

        # Every header below is optional, so each lookup is best effort: when
        # the header is missing or cannot be parsed, the attribute keeps its
        # default value and no indicator is created.

        # From address
        try:
            self.from_address = self._get_address_list(email_obj, 'from')[0][1]
            self.indicators.append(Indicator('Email - Address', self.from_address, tags=['from_address']))
        except Exception:
            pass

        # From domain
        try:
            self.indicators.append(Indicator('URI - Domain Name', self.from_address.split('@')[1], tags=['from_domain']))
        except Exception:
            pass

        # Reply-To address
        try:
            self.reply_to = self._get_address_list(email_obj, 'reply-to')[0][1]
            self.indicators.append(Indicator('Email - Address', self.reply_to, tags=['reply_to']))
        except Exception:
            pass

        # X-Sender address
        try:
            self.x_sender = self._get_address_list(email_obj, 'X-Sender')[0][1]
            self.indicators.append(Indicator('Email - Address', self.x_sender, tags=['x_sender']))
        except Exception:
            pass

        # X-Sender-Id address
        try:
            self.x_sender_id = self._get_address_list(email_obj, 'X-Sender-Id')[0][1]
            self.indicators.append(Indicator('Email - Address', self.x_sender_id, tags=['x_sender_id']))
        except Exception:
            pass

        # X-Auth-Id address
        try:
            self.x_auth_id = self._get_address_list(email_obj, 'X-Auth-ID')[0][1]
            self.indicators.append(Indicator('Email - Address', self.x_auth_id, tags=['x_auth_id']))
        except Exception:
            pass

        # Return-Path address. The header is named "Return-Path"; the old
        # lookup key 'return_path' did not follow the hyphenated header names
        # used by every other lookup here.
        # NOTE(review): assumes _get_address_list looks the header up by its
        # name - confirm against that helper.
        try:
            self.return_path = self._get_address_list(email_obj, 'return-path')[0][1]
            self.indicators.append(Indicator('Email - Address', self.return_path, tags=['return_path']))
        except Exception:
            pass

        # X-MS-Exchange-Organization-OriginalEnvelopeRecipients address
        try:
            self.original_recipient = self._get_address_list(email_obj, 'X-MS-Exchange-Organization-OriginalEnvelopeRecipients')[0][1].lower()
            self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
        except Exception:
            pass
        # If the original_recipient was not found, check if this is a POTENTIAL PHISH e-mail and use the from address.
        if not self.original_recipient and 'Subject: [POTENTIAL PHISH]' in self.email_text:
            try:
                # Use the outer e-mail here, not an embedded one found above.
                temp_email_obj = email.message_from_string(self.email_text)
                self.original_recipient = self._get_address_list(temp_email_obj, 'from')[0][1]
                self.indicators.append(Indicator('Email - Address', self.original_recipient, status='Informational', tags=['original_recipient']))
            except Exception:
                self.logger.exception('Error parsing original recipient from POTENTIAL PHISH e-mail.')

        # Subject
        try:
            self.subject = ''.join(self.get_header(email_obj, 'subject')[0].splitlines())
            self.indicators.append(Indicator('Email - Subject', self.subject))
        except Exception:
            pass

        # Decoded subject
        try:
            self.subject_decoded = ''.join(str(make_header(decode_header(self.get_header(email_obj, 'subject')[0]))).splitlines())
            self.indicators.append(Indicator('Email - Subject', self.subject_decoded))
        except Exception:
            pass

        # To addresses
        self.to_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'to')]

        # CC addresses
        self.cc_addresses = [x[1].lower() for x in self._get_address_list(email_obj, 'cc')]

        # Message-Id
        try:
            self.message_id = self.get_header(email_obj, 'message-id')[0]
            self.indicators.append(Indicator('Email Message ID', self.message_id, status='Informational'))
        except Exception:
            pass

        # X-Mailer
        try:
            self.x_mailer = self.get_header(email_obj, 'x-mailer')[0]
            self.indicators.append(Indicator('Email - Xmailer', self.x_mailer, status='Informational'))
        except Exception:
            pass

        # X-Original-Sender address
        try:
            self.x_original_sender = self.get_header(email_obj, 'x-original-sender')[0]
            self.indicators.append(Indicator('Email - Address', self.x_original_sender, tags=['x_original_sender']))
        except Exception:
            pass

        # X-Originating-Ip
        try:
            x_originating_ip = self.get_header(email_obj, 'x-originating-ip')[0]
            # Sometimes this field is in the form: [1.1.1.1]
            # Make sure we remove any non-IP characters.
            ip = RegexHelpers.find_ip_addresses(x_originating_ip)
            if ip:
                self.x_originating_ip = ip[0]
                self.indicators.append(Indicator('Address - ipv4-addr', self.x_originating_ip, tags=['x_originating_ip']))
        except Exception:
            pass

        # X-Sender-Ip
        try:
            x_sender_ip = self.get_header(email_obj, 'x-sender-ip')[0]
            # Make sure like the X-Originating-IP that we only
            # get the IP address and no other characters.
            ip = RegexHelpers.find_ip_addresses(x_sender_ip)
            if ip:
                self.x_sender_ip = ip[0]
                self.indicators.append(Indicator('Address - ipv4-addr', self.x_sender_ip, tags=['x_sender_ip']))
        except Exception:
            pass

        # Fall back to the Date-based time when no received time was found.
        self.received_time = self._get_received_time(email_obj)
        if not self.received_time:
            self.received_time = self._get_date_time()

        # Find any URLs in the plaintext body.
        text_urls = find_urls(self.body)

        # Find any URLs in the HTML body.
        html_urls = find_urls(self.html)

        # URLs found in attachment strings are currently not collected, so
        # this list stays empty.
        strings_urls = []

        # Try and remove any URLs that look like partial versions of other URLs.
        all_urls = text_urls + html_urls + strings_urls
        unique_urls = set()
        for u in all_urls:
            if not any(other_url.startswith(u) and other_url != u for other_url in all_urls):
                unique_urls.add(u)

        # Get rid of any invalid URLs.
        self.urls = [u for u in unique_urls if is_valid(u)]

        # Make indicators for the URLs.
        self.indicators += make_url_indicators(self.urls)

        # Get rid of any invalid indicators.
        self.indicators = [i for i in self.indicators if i.value]

        # Add any extra tags to each indicator.
        for i in self.indicators:
            i.tags.append('phish')
Пример #7
0
def make_url_indicators(urls, tags=None):
    """Make indicators from a list of URLs.

    For every valid URL (and for the same URL with its query string and
    fragment dropped) this emits indicators for the host (IP address or
    domain), the first-level domain reported by ``get_fld``, the full URL,
    the path, the percent-decoded path (when it differs) and the query.

    Args:
        urls: A single URL string or an iterable of URL strings. Entries
            rejected by ``is_valid`` are skipped.
        tags: Optional list of tags attached to every indicator. Defaults
            to no tags. The list is copied and never mutated.

    Returns:
        A list of the created indicators whose ``value`` is truthy.
        Duplicates are collapsed through ``set()``, so the order of the
        result is unspecified.
    """
    logger = logging.getLogger(__name__)

    if isinstance(urls, str):
        urls = [urls]

    # Work on a private copy. A mutable default (``tags=[]``) or a single list
    # handed to many indicators would be shared state; if an Indicator keeps
    # the reference, a later ``indicator.tags.append(...)`` would leak into
    # every other indicator and into subsequent calls.
    base_tags = [] if tags is None else list(tags)

    # Matches a leading "username:password@" prefix inside a netloc.
    user_pass_pattern = re.compile(r'(.*?:.*?@)')

    indicators = []

    for url in set(urls):
        if not is_valid(url):
            continue

        split_url = urlsplit(url)
        url_without_query = split_url.scheme + '://' + split_url.netloc + split_url.path

        # Process both the URL as given and its query-less form.
        for variation in {url, url_without_query}:
            # Whitelist-based status selection is currently disabled, so
            # every component is reported as 'New'.
            status = 'New'

            parsed_url = urlsplit(variation)

            # A ':' in the netloc indicates a port number; drop it so the
            # host indicator contains only the host.
            # NOTE: bracketed IPv6 literals are not handled by this split.
            netloc = parsed_url.netloc.split(':')[0]

            # Edge case: the URL uses the username:password@host notation.
            # Strip the credentials and re-parse so that host and path come
            # from the credential-free URL.
            if ':' in parsed_url.netloc and '@' in parsed_url.netloc:
                match = user_pass_pattern.search(parsed_url.netloc)
                if match:
                    try:
                        stripped = urlsplit(
                            variation.replace(match.group(1), ''))
                    except ValueError:
                        logger.debug(
                            'Unable to strip credentials from URL: %s',
                            variation)
                    else:
                        parsed_url = stripped
                        # The credential-free netloc may still carry a
                        # port, so strip it again.
                        netloc = stripped.netloc.split(':')[0]

            # Domain/IP
            try:
                ipaddress.ip_address(netloc)
            except ValueError:
                indicators.append(
                    Indicator('URI - Domain Name',
                              netloc,
                              status=status,
                              tags=base_tags + ['domain_in_url'],
                              relationships=[variation]))
            else:
                indicators.append(
                    Indicator('Address - ipv4-addr',
                              netloc,
                              status=status,
                              tags=base_tags + ['ip_in_url'],
                              relationships=[variation]))

            # TLD
            tld = get_fld('http://{}'.format(netloc), fail_silently=True)
            if tld:
                indicators.append(
                    Indicator('URI - Domain Name',
                              tld,
                              status=status,
                              tags=list(base_tags),
                              relationships=[variation]))

            # Full URL
            indicators.append(
                Indicator('URI - URL', variation, tags=list(base_tags)))

            # Path
            indicators.append(
                Indicator('URI - Path',
                          parsed_url.path,
                          status=status,
                          tags=list(base_tags),
                          relationships=[variation, parsed_url.netloc]))

            # Percent-decoded path, only when decoding changes something.
            try:
                decoded_path = urllib.parse.unquote(parsed_url.path)
            except (TypeError, ValueError):
                logger.debug('Unable to decode URL path: %s', parsed_url.path)
            else:
                if decoded_path != parsed_url.path:
                    indicators.append(
                        Indicator('URI - Path',
                                  decoded_path,
                                  status=status,
                                  tags=list(base_tags),
                                  relationships=[variation, parsed_url.netloc]))

            # Query (reported with the 'URI - Path' indicator type).
            indicators.append(
                Indicator('URI - Path',
                          parsed_url.query,
                          status=status,
                          tags=list(base_tags),
                          relationships=[variation, parsed_url.netloc]))

    # Drop duplicates and indicators with an empty value (e.g. empty query).
    return [i for i in set(indicators) if i.value]