def test_ipextract(self): logging.debug("Performing HTML extract tests") self.assertTrue(IOCRegex.search(IP, DATA)) results = IOCRegex.extract_all_possible(DATA) self.assertTrue(all([i in IP_RESULTS for i in results[IP]])) self.assertTrue(all([i in DOMAIN_RESULTS for i in results[DOMAIN]])) self.assertTrue( all([i in DF_DOMAIN_RESULTS for i in results[DF_DOMAIN]]))
def test_ipextract(self): logging.debug( "Performing extract tests on things that failed and shouldnt") self.assertTrue(IOCRegex.search(URL_POT, DATA2)) w, dw = IOCRegex.extract_value(URL_POT, DATA2) self.assertTrue(len(w) == 0 and len(dw) == 1) self.assertTrue(dw[0] == IOCRegex.defang(DATA2)) results = IOCRegex.extract_all_possible(DATA2) self.assertTrue('badguys.com.tr' in results[DF_DOMAIN])
def test_ipextract(self): logging.debug("Performing IP extract tests") self.assertTrue(IOCRegex.search(IP, DATA)) results = IOCRegex.extract_all_possible(DATA) self.assertTrue(len(results[DF_IP]) == 1) self.assertTrue(len(results[IP]) == 1) ip = results[IP][0] self.assertTrue(ip == EXP_IP_VALUE) ip = results[DF_IP][0] self.assertTrue(ip == DF_EXP_IP_VALUE)
def extract_contents(self, content):
    # Pull entities (hosts, domains, IPs, hashes, links) out of the
    # message content with the regex extractors.
    # DEBUG_CONTENT.append(content)
    # Replace punctuation that confuses the extractors with spaces.
    # Note: this must be a string (not a one-element list) so the loop
    # strips one character at a time.
    replace_chars = '''`~!@#$^&*()'"{}'''
    new_content = content
    for c in replace_chars:
        new_content = new_content.replace(c, ' ')
    rexk = self.regex_keywords
    entities = IOCREX.extract_all_possible(new_content, addl_keywords=rexk)
    domains = entities[consts.DOMAIN] + entities[consts.DF_DOMAIN]
    # Hash artifacts are not used downstream, so drop them.
    hash_keys = [consts.MD5, consts.SHA1, consts.SHA256, consts.SHA512]
    for k in hash_keys:
        del entities[k]
    tags = [i.strip('#') for i in entities[consts.HASH_TAG]]
    entities[consts.HASH_TAG] = tags
    entities['processed_tags'] = self.hashtags_with_tags(tags)
    entities['site_rankings'] = self.lookup_sites_rank(domains)
    # us = self.expand_twitter_urls_extract_content(entities[consts.URL])
    # entities['linked_content'] = us
    # Drop domains the filter considers benign.
    for k in [consts.DOMAIN, consts.DF_DOMAIN]:
        _t = entities.get(k, [])
        entities[k] = [d for d in _t if not IOCREX.filter_domain(d)]
    # Drop URLs whose host would have been filtered as a domain.
    url_keys = [consts.URL, consts.DF_URL, consts.URL_POT, consts.DF_URL_POT]
    for k in url_keys:
        urls = entities.get(k, [])
        _s = zip(urls, IOCREX.hosts_from_urls(urls, True))
        entities[k] = [u for u, d in _s if not IOCREX.filter_domain(d)]
    edip = entities[consts.DF_DOMAIN] + entities[consts.DF_IP]
    adip = []  # us[consts.DF_DOMAIN] + us[consts.DF_IP]
    ht = entities[consts.HASH_TAG]  # + us[consts.HASH_TAG]
    kw = entities[consts.KEYWORDS]  # + us[consts.KEYWORDS]
    safe_hosts = [i.replace('.', '[.]') for i in edip + adip]
    entities[consts.HASH_TAG] = ht
    entities[consts.KEYWORDS] = kw
    entities['safe_hosts'] = safe_hosts
    good_message = IOCREX.is_good_result(entities)
    return good_message, entities
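# Hypothetical sketch of the kind of check IOCREX.filter_domain performs;
# the real implementation lives in the IOCREX module, and the whitelist
# below is made up for illustration. A True return means "drop this
# domain from the results".
_BENIGN_DOMAINS_EXAMPLE = {'twitter.com', 't.co', 'virustotal.com'}

def _filter_domain_example(domain):
    d = domain.lower().replace('[.]', '.')
    return d in _BENIGN_DOMAINS_EXAMPLE or any(
        d.endswith('.' + b) for b in _BENIGN_DOMAINS_EXAMPLE)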
def extract_embedded_links(self):
    urls = set()
    # Initialize every key that gets appended to below; the defanged
    # (DF_*) keys were missing and caused KeyErrors.
    el_r = {consts.LINKS: [], consts.DF_LINKS: [],
            consts.DOMAINS: [], consts.DF_DOMAINS: [],
            consts.IPS: [], consts.DF_IPS: []}
    for link in BeautifulSoup(self.content, 'html.parser',
                              parse_only=SoupStrainer('a')):
        line = None
        if link.has_attr('href'):
            line = link['href']
        if line is None or len(line) < 3:
            continue
        el = IOCREX.extract_link(line)
        el_r[consts.LINKS] = el_r[consts.LINKS] + el[consts.LINKS]
        el_r[consts.DF_LINKS] = el_r[consts.DF_LINKS] + el[consts.DF_LINKS]
        for info in el[consts.DF_LINKS]:
            url = info[consts.URL]
            if url in urls:
                continue
            urls.add(url)
            hi = IOCREX.extract_domain_or_host(url)
            # Accumulate both the defanged and fanged hosts; the fanged
            # results go into IPS/DOMAINS (the original overwrote the
            # DF_* keys here).
            _t = el_r[consts.DF_IPS] + list(hi[consts.DF_IPS])
            el_r[consts.DF_IPS] = _t
            qs = list(hi[consts.DF_DOMAINS])
            el_r[consts.DF_DOMAINS] = el_r[consts.DF_DOMAINS] + qs
            x = el_r[consts.IPS] + list(hi[consts.IPS])
            el_r[consts.IPS] = sorted(set(x))
            y = el_r[consts.DOMAINS] + list(hi[consts.DOMAINS])
            el_r[consts.DOMAINS] = sorted(set(y))
        for info in el[consts.LINKS]:
            url = info[consts.URL]
            if url in urls:
                continue
            urls.add(url)
            hi = IOCREX.extract_domain_or_host(url)
            x = el_r[consts.IPS] + list(hi[consts.IPS])
            el_r[consts.IPS] = sorted(set(x))
            y = el_r[consts.DOMAINS] + list(hi[consts.DOMAINS])
            el_r[consts.DOMAINS] = sorted(set(y))
    return el_r
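# Self-contained illustration of the href-harvesting pattern used in
# extract_embedded_links above. Only bs4 is required; the sample HTML is
# made up for demonstration.
def _soupstrainer_demo():
    from bs4 import BeautifulSoup, SoupStrainer
    sample = ('<p><a href="http://example.com/a">one</a>'
              '<a name="anchor-only">two</a></p>')
    hrefs = []
    # parse_only=SoupStrainer('a') keeps only <a> tags while parsing.
    for tag in BeautifulSoup(sample, 'html.parser',
                             parse_only=SoupStrainer('a')):
        if tag.has_attr('href'):
            hrefs.append(tag['href'])
    return hrefs  # -> ['http://example.com/a']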
def __init__(self, link=None, content=None, look_at_embedded_links=True):
    if content is None and link is None:
        raise Exception("Provide either a link or content to analyze")
    self.link = link
    self.expanded_link = False
    self.orig_link = link
    self.content = content
    self.content_type = 'html'
    self.response = None
    self.bs4_parser = None
    if self.link is not None and self.content is None:
        # Download the link and read the contents.
        self.response = requests.get(self.link)
        # self.response = requests.get(self.link, headers=consts.HEADERS())
        self.content_type = self.response.headers['content-type']
        if self.response.status_code == 200:
            self.content = self.response.text
            # requests records the final URL after any redirects.
            self.link = self.response.request.url
            logging.debug("Expanded link to: %s" % self.link)
            self.expanded_link = self.orig_link != self.link
        else:
            _m = "Unable to get the specified content:" + \
                " HTTP STATUS CODE = %d"
            raise Exception(_m % self.response.status_code)
    if self.content_type.find('html') > -1 or \
            self.content_type.find('text/plain') > -1:
        self.bs4_parser = BeautifulSoup(self.content, 'html.parser')
    elif self.content_type.find('json') > -1:
        # find() returns -1 (truthy) on a miss, so the comparison above
        # is required. Normalize the JSON into sorted key/value lines,
        # then parse.
        json_data = json.loads(self.content)
        self.content = json.dumps(json_data, indent=0, sort_keys=True)
        self.bs4_parser = BeautifulSoup(self.content, 'html.parser')
    # Honor look_at_embedded_links, which was previously accepted but
    # never consulted.
    self.embedded_links = []
    if look_at_embedded_links:
        try:
            self.embedded_links = self.extract_embedded_links()
        except Exception:
            self.embedded_links = []
    try:
        # Use self.content so downloaded pages are scanned too, not just
        # content passed in directly.
        self.artifacts = IOCREX.extract_all_possible(self.content)
    except Exception:
        self.artifacts = {}
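# Hedged usage sketch: assuming this __init__ belongs to the content
# handler referred to elsewhere as CH, construction looks roughly like
# the following (the URL is illustrative only):
#
#   ch = CH(link='http://example.com/report.html')
#   ch.content_type   # e.g. 'text/html; charset=utf-8'
#   ch.expanded_link  # True when the request was redirected
#   ch.artifacts      # IOC types extracted from the downloaded page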
def expand_twitter_urls_extract_content(self, urls):
    expansions = []
    content_artifacts = {'expansions': expansions}
    for k in ARTIFACT_KEYS:
        content_artifacts[k] = []
    for url in urls:
        # Mongo does not like '.' in key names, so record the url as a
        # value rather than a key for the time being.
        # was expansions = {}; expansions[url] = ...
        results = {
            'url': url,
            'failed': True,
            'expanded_link': None,
            'link': None,
            'orig_link': None,
            'content_type': None,
            'artifacts': {},
        }
        if not self.grab_linked_content:
            expansions.append(results)
            continue
        try:
            logging.debug("Attempting to download url: %s" % url)
            ch = CH(link=url)
            results['failed'] = False
            # results['content'] = ch.content
            results['expanded_link'] = ch.expanded_link
            results['link'] = ch.link
            results['orig_link'] = url
            results['content_type'] = ch.content_type
            results['artifacts'] = ch.artifacts
        except Exception:
            _m = 'Failed with the following exception:\n{}'
            logging.debug(_m.format(traceback.format_exc()))
        expansions.append(results)
    # Merge artifacts from every expansion into a single mapping.
    for expansion in expansions:
        artifacts = expansion['artifacts']
        for k in ARTIFACT_KEYS:
            content_artifacts[k] = content_artifacts[k] + artifacts.get(k, [])
    # Drop domains the filter considers benign.
    for k in [consts.DOMAIN, consts.DF_DOMAIN]:
        _t = content_artifacts[k]
        content_artifacts[k] = [d for d in _t if not IOCREX.filter_domain(d)]
    # Drop URLs whose host would have been filtered as a domain.
    url_keys = [consts.URL, consts.DF_URL, consts.URL_POT, consts.DF_URL_POT]
    for k in url_keys:
        _urls = content_artifacts.get(k, [])
        _s = zip(_urls, IOCREX.hosts_from_urls(_urls, True))
        content_artifacts[k] = [u for u, d in _s
                                if not IOCREX.filter_domain(d)]
    return content_artifacts