Example #1
 def test_ipextract(self):
     logging.debug("Performing HTML extract tests")
     self.assertTrue(IOCRegex.search(IP, DATA))
     results = IOCRegex.extract_all_possible(DATA)
     self.assertTrue(all([i in IP_RESULTS for i in results[IP]]))
     self.assertTrue(all([i in DOMAIN_RESULTS for i in results[DOMAIN]]))
     self.assertTrue(
         all([i in DF_DOMAIN_RESULTS for i in results[DF_DOMAIN]]))
Example #2
 def test_ipextract(self):
     logging.debug(
         "Performing extract tests on things that failed and shouldn't")
     self.assertTrue(IOCRegex.search(URL_POT, DATA2))
     w, dw = IOCRegex.extract_value(URL_POT, DATA2)
     self.assertTrue(len(w) == 0 and len(dw) == 1)
     self.assertTrue(dw[0] == IOCRegex.defang(DATA2))
     results = IOCRegex.extract_all_possible(DATA2)
     self.assertTrue('badguys.com.tr' in results[DF_DOMAIN])
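For orientation, the defang call asserted above appears to rewrite dots so an indicator cannot be resolved or clicked by accident; Example #4 uses the same replace('.', '[.]') pattern for its safe_hosts. A minimal standalone sketch of that idea (the helper name is illustrative, not the project's actual IOCRegex.defang API):

# Illustrative sketch of defanging; mirrors the replace('.', '[.]')
# pattern used for safe_hosts in Example #4, not the project's API.
def defang(indicator):
    # 'badguys.com.tr' -> 'badguys[.]com[.]tr'
    return indicator.replace('.', '[.]')

print(defang('badguys.com.tr'))  # -> badguys[.]com[.]tr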
Example #3
 def test_ipextract(self):
     logging.debug("Performing IP extract tests")
     self.assertTrue(IOCRegex.search(IP, DATA))
     results = IOCRegex.extract_all_possible(DATA)
     self.assertTrue(len(results[DF_IP]) == 1)
     self.assertTrue(len(results[IP]) == 1)
     ip = results[IP][0]
     self.assertTrue(ip == EXP_IP_VALUE)
     ip = results[DF_IP][0]
     self.assertTrue(ip == DF_EXP_IP_VALUE)
Example #4
    def extract_contents(self, content):
        # get some content from the message and replace \u2026
        # DEBUG_CONTENT.append(content)
        # RE extract entities (hosts, domains, IPs, hashes), links
        replace_chars = '''`~!@#$^&*()'"{}'''
        new_content = content
        for c in replace_chars:
            new_content = new_content.replace(c, ' ')
        rexk = self.regex_keywords
        entities = IOCREX.extract_all_possible(new_content, addl_keywords=rexk)

        domains = entities[consts.DOMAIN] + entities[consts.DF_DOMAIN]
        hashes_s = [consts.MD5, consts.SHA1, consts.SHA256, consts.SHA512]

        for k in hashes_s:
            del entities[k]

        tags = [i.strip('#') for i in entities[consts.HASH_TAG]]
        entities[consts.HASH_TAG] = tags
        entities['processed_tags'] = self.hashtags_with_tags(tags)
        entities['site_rankings'] = self.lookup_sites_rank(domains)

        # us = self.expand_twitter_urls_extract_content(entities[consts.URL])
        # entities['linked_content'] = us

        keys = [consts.DOMAIN, consts.DF_DOMAIN]
        for k in keys:
            new_items = []
            _t = entities.get(k, [])
            new_items = [d for d in _t if not IOCREX.filter_domain(d)]
            entities[k] = new_items

        keys = [consts.URL, consts.DF_URL, consts.URL_POT, consts.DF_URL_POT]
        for k in keys:
            new_items = []
            urls = entities.get(k, [])
            _s = zip(urls, IOCREX.hosts_from_urls(urls, True))
            new_items = [u for u, d in _s if not IOCREX.filter_domain(d)]
            entities[k] = new_items

        edip = entities[consts.DF_DOMAIN] + entities[consts.DF_IP]
        adip = []  # us[consts.DF_DOMAIN] + us[consts.DF_IP]
        ht = entities[consts.HASH_TAG]  # + us[consts.HASH_TAG]
        kw = entities[consts.KEYWORDS]  # + us[consts.KEYWORDS]
        safe_hosts = [i.replace('.', '[.]') for i in edip + adip]

        entities[consts.HASH_TAG] = ht
        entities[consts.KEYWORDS] = kw
        entities['safe_hosts'] = safe_hosts
        good_message = IOCREX.is_good_result(entities)
        return good_message, entities
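As an aside, the punctuation stripping at the top of extract_contents can be done in one pass with str.translate instead of a per-character replace loop; a self-contained sketch of that alternative:

# One-pass alternative to the per-character replace loop above.
REPLACE_CHARS = '''`~!@#$^&*()'"{}'''
STRIP_TABLE = str.maketrans({c: ' ' for c in REPLACE_CHARS})

def clean_content(content):
    # Map each punctuation character to a space; everything else is kept.
    return content.translate(STRIP_TABLE)

print(clean_content('badguys(.)com'))  # -> badguys . com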
Example #5
    def extract_embedded_links(self):
        urls = set()
        el_r = {consts.LINKS: [], consts.DF_LINKS: [],
                consts.DOMAINS: [], consts.DF_DOMAINS: [],
                consts.IPS: [], consts.DF_IPS: []}
        for link in BeautifulSoup(self.content,
                                  'html.parser',
                                  parse_only=SoupStrainer('a')):
            line = None
            if link.has_attr('href'):
                line = link['href']

            if line is None or len(line) < 3:
                continue

            el = IOCREX.extract_link(line)
            el_r[consts.LINKS] = el_r[consts.LINKS] + el[consts.LINKS]
            el_r[consts.DF_LINKS] = el_r[consts.DF_LINKS] + el[consts.DF_LINKS]

            if len(el[consts.DF_LINKS]) > 0:
                for info in el[consts.DF_LINKS]:
                    url = info[consts.URL]
                    if url in urls:
                        continue
                    urls.add(url)

                    hi = IOCREX.extract_domain_or_host(url)
                    _t = el_r[consts.DF_IPS] + list(hi[consts.DF_IPS])
                    el_r[consts.DF_IPS] = sorted(set(_t))
                    qs = el_r[consts.DF_DOMAINS] + list(hi[consts.DF_DOMAINS])
                    el_r[consts.DF_DOMAINS] = sorted(set(qs))
                    x = el_r[consts.IPS] + list(hi[consts.IPS])
                    el_r[consts.IPS] = sorted(set(x))
                    y = el_r[consts.DOMAINS] + list(hi[consts.DOMAINS])
                    el_r[consts.DOMAINS] = sorted(set(y))

            if len(el[consts.LINKS]) > 0:
                for info in el[consts.LINKS]:
                    url = info[consts.URL]
                    if url in urls:
                        continue
                    urls.add(url)

                    hi = IOCREX.extract_domain_or_host(url)
                    x = el_r[consts.IPS] + [i for i in hi[consts.IPS]]
                    el_r[consts.IPS] = sorted(set(x))
                    y = el_r[consts.DOMAINS] + [i for i in hi[consts.DOMAINS]]
                    el_r[consts.DOMAINS] = sorted(set(y))
        return el_r
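For reference, the BeautifulSoup idiom this method builds on can be run standalone; a minimal sketch using only bs4 (the HTML is illustrative):

from bs4 import BeautifulSoup, SoupStrainer

html = '<a href="http://example.com/a">a</a><a name="no-href">b</a>'

# parse_only keeps the tree to <a> tags, so iterating the soup yields
# just the anchors; skip any that lack an href attribute.
for link in BeautifulSoup(html, 'html.parser', parse_only=SoupStrainer('a')):
    if link.has_attr('href'):
        print(link['href'])  # -> http://example.com/a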
Example #6
    def __init__(self, link=None, content=None, look_at_embedded_links=True):
        if content is None and link is None:
            raise Exception("Provide either a link or content to analyze")

        self.link = link
        self.expanded_link = False
        self.orig_link = link
        self.content = content
        self.content_type = 'html'
        self.response = None
        self.bs4_parser = None
        if self.link is not None and self.content is None:
            # download the link
            self.response = requests.get(self.link)
            # self.response = requests.get(self.link, headers=consts.HEADERS())
            self.content_type = self.response.headers['content-type']
            # read the contents
            if self.response.status_code == 200:
                self.content = self.response.text
                self.link = self.response.request.url
                logging.debug("Expanded link to: %s" % self.link)
                self.expanded_link = self.orig_link != self.link
            else:
                _m = "Unable to get the specified content:" +\
                     " HTTP STATUS CODE = %d"
                raise Exception(_m % self.response.status_code)

        if self.content_type.find('html') > -1 or \
           self.content_type.find('text/plain') > -1:
            self.bs4_parser = BeautifulSoup(self.content, 'html.parser')
        elif self.content_type.find('json') > -1:
            # create key value mappings line by line
            json_data = json.loads(self.content)
            self.content = json.dumps(json_data, indent=0, sort_keys=True)
            self.bs4_parser = BeautifulSoup(self.content, 'html.parser')

        try:
            self.embedded_links = self.extract_embedded_links()
        except Exception:
            self.embedded_links = []

        try:
            self.artifacts = IOCREX.extract_all_possible(self.content)
        except Exception:
            self.artifacts = {}
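A hedged usage sketch: Example #7 refers to this class through the alias CH, so assuming that name, construction with either a link or raw content would look like the following (URL and HTML are illustrative):

# Assumes the constructor above belongs to the class aliased as CH in
# Example #7; the URL below is illustrative.
ch = CH(link='http://example.com/report.html')  # fetches and parses the page
print(ch.content_type)    # e.g. 'text/html; charset=utf-8'
print(ch.artifacts)       # IOCs extracted from the fetched content
print(ch.embedded_links)  # links/hosts/IPs pulled from <a href=...> tags

# Or analyze pre-fetched content directly, skipping the HTTP request:
ch2 = CH(content='<a href="http://badguys.com.tr">click</a>')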
Example #7
    def expand_twitter_urls_extract_content(self, urls):
        expansions = []
        content_artifacts = {'expansions': expansions}
        for k in ARTIFACT_KEYS:
            content_artifacts[k] = []

        for url in urls:
            # Mongo does not like '.' in names so
            # I need to dodge that for the time being
            # was expansions = {}; expansions[url] = ...
            results = {
                'url': url,
                'failed': True,
                'expanded_link': None,
                'link': None,
                'orig_link': None,
                'content_type': None,
                'artifacts': {},
            }
            if not self.grab_linked_content:
                expansions.append(results)
                continue

            try:
                logging.debug("Attempting to download url: %s" % url)
                ch = CH(link=url)
                results['failed'] = False
                # results['content'] = ch.content
                results['expanded_link'] = ch.expanded_link
                results['link'] = ch.link
                results['orig_link'] = url
                results['content_type'] = ch.content_type
                results['artifacts'] = ch.artifacts
            except Exception:
                _m = 'Failed with the following exception:\n{}'
                logging.debug(_m.format(traceback.format_exc()))
            expansions.append(results)

        for expansion in expansions:
            artifacts = expansion['artifacts']
            for k in ARTIFACT_KEYS:
                items = content_artifacts[k] + artifacts.get(k, [])
                content_artifacts[k] = items

        # filter domains
        keys = [consts.DOMAIN, consts.DF_DOMAIN]
        for k in keys:
            new_items = []
            _t = content_artifacts[k]
            new_items = [d for d in _t if not IOCREX.filter_domain(d)]
            content_artifacts[k] = new_items

        keys = [consts.URL, consts.DF_URL, consts.URL_POT, consts.DF_URL_POT]
        for k in keys:
            new_items = []
            urls = content_artifacts.get(k, [])
            _s = zip(urls, IOCREX.hosts_from_urls(urls, True))
            new_items = [u for u, d in _s if not IOCREX.filter_domain(d)]
            content_artifacts[k] = new_items

        return content_artifacts