def test_ipv4_extract(self):
    """Valid IPv4s (plain and defanged) are extracted intact, in any wrapping;
    invalid dotted strings are never extracted."""
    valid = [
        '127.0.0.1',
        '192.168.255.255',
        '1.1.1.1',
        '1[.]1[.]1[.]1',
        '1(.)1(.)1(.)1',
        '111[.]111[.]111[.]111',
        '111[.]111.111[.]111',
        '111[.111.]111[.111',
        '0.0.0.0',
        '100.100.100.100',
        '200.200.200.200',
        '200.201.210.209',
        '105.105.105.105',
        '250.250.250.250',
        '26.26.26.26',
        '255.255.255.255',
    ]
    wrappers = [_wrap_spaces, _wrap_tabs, _wrap_newlines, _wrap_words, _wrap_nonwords]
    for item in valid:
        # Bare string first, then each surrounding-context wrapper.
        self.assertEqual(list(iocextract.extract_ipv4s(item))[0], item)
        for wrap in wrappers:
            self.assertEqual(list(iocextract.extract_ipv4s(wrap(item)))[0], item)

    invalid = [
        '192.168.1',
        '192.168.a.1',
        '11111.1111.1111.1111',
    ]
    negative_wrappers = [_wrap_spaces, _wrap_tabs, _wrap_newlines]
    for item in invalid:
        self.assertEqual(len(list(iocextract.extract_ipv4s(item))), 0)
        for wrap in negative_wrappers:
            self.assertEqual(len(list(iocextract.extract_ipv4s(wrap(item)))), 0)
def extractIOC(path):
    """Run strings64.exe over the file at *path* and extract IOCs from the
    printable strings.

    :param path: path of the binary to scan
    :return: tuple ``(urls, ipv4s, ipv6s, emails)`` of extracted indicators
    """
    extractor = URLExtract()
    command = 'src\\strings64.exe ' + path
    try:
        out = execute_command(command)
    except Exception:
        # One retry for transient failures; a second failure propagates
        # instead of being hidden by a bare except.
        out = execute_command(command)
    out = out.decode("utf-8").split('\n')
    haystack = str(out)

    extract_url = []
    for url in iocextract.extract_urls(haystack, refang=True, strip=True):
        # Re-validate each candidate with URLExtract; skip non-matches
        # (the original swallowed the IndexError from an empty result).
        candidates = extractor.find_urls(url)
        if candidates:
            extract_url.append(str(candidates[0]).replace("\\r", ""))
    # Deduplicate (order is not preserved, matching the original set() use).
    extract_url = list(set(extract_url))

    ipv4 = list(iocextract.extract_ipv4s(haystack, refang=True))
    ipv6 = list(iocextract.extract_ipv6s(haystack))
    emails = [str(email).replace("\\r", "")
              for email in iocextract.extract_emails(haystack, refang=True)]
    return (extract_url, ipv4, ipv6, emails)
def artifacts(self, raw):
    """Build one 'ip' artifact per unique IPv4 address found in *raw*,
    preserving first-seen order."""
    unique_ips = dict.fromkeys(iocextract.extract_ipv4s(str(raw)))
    return [self.build_artifact('ip', str(addr)) for addr in unique_ips]
def test_refang_ipv4(self):
    """Every defanged variant refangs back to the plain dotted-quad form.

    Uses ``assertEqual``: ``assertEquals`` is a deprecated alias and was
    removed in Python 3.12.
    """
    content_list = [
        '111.111.111.111',
        '111[.]111[.]111[.]111',
        '111(.)111(.)111(.)111',
        '111[.]111[.]111[.]111',
        '111[.]111.111[.]111',
        '111[.111.]111[.111',
    ]
    for content in content_list:
        self.assertEqual(
            list(iocextract.extract_ipv4s(content, refang=True))[0],
            '111.111.111.111')
        self.assertEqual(iocextract.refang_ipv4(content), '111.111.111.111')
def _utility_ioc_extractor_function(self, event, *args, **kwargs):
    """Resilient workflow function: strip HTML from the incoming text and
    extract deduplicated IOCs grouped by type.

    Reads ``incident_id`` and ``text_string`` from kwargs, yields a
    FunctionResult whose payload maps IOC type -> ordered, de-duplicated
    list; yields FunctionError if anything raises.
    """
    results = {}
    results["was_successful"] = False
    try:
        # Get the function parameters:
        incident_id = kwargs.get("incident_id")  # number
        text_string = kwargs.get("text_string")  # text

        log = logging.getLogger(__name__)  # Establish logging

        # Strip HTML and normalize text
        text_string = unicodedata.normalize(
            "NFKD",
            BeautifulSoup(text_string, "html.parser").get_text(' '))

        # Parse IOCs by type from text_string - OrderedDict.fromkeys() preserves order and removes duplicates.
        results["ipv4s"] = list(
            OrderedDict.fromkeys(
                list(iocextract.extract_ipv4s(text_string, refang=True))))
        results["ipv6s"] = list(
            OrderedDict.fromkeys(
                list(iocextract.extract_ipv6s(text_string))))
        # URLs and domains
        results["urls"] = list(
            OrderedDict.fromkeys(
                list(iocextract.extract_urls(text_string, refang=True))))
        # domains only
        results["domains"] = list(
            OrderedDict.fromkeys(
                [urlparse(url).netloc for url in results["urls"]]))
        results["email_addresses"] = list(
            OrderedDict.fromkeys(
                list(iocextract.extract_emails(text_string, refang=True))))
        # domains only (assumes every extracted address contains '@')
        results["email_domains"] = list(
            OrderedDict.fromkeys(
                [email.split('@')[1] for email in results["email_addresses"]]))
        results["md5_hashes"] = list(
            OrderedDict.fromkeys(
                list(iocextract.extract_md5_hashes(text_string))))
        results["sha256_hashes"] = list(
            OrderedDict.fromkeys(
                list(iocextract.extract_sha256_hashes(text_string))))
        results["was_successful"] = True

        # Produce a FunctionResult with the results
        yield FunctionResult(results)
    except Exception:
        yield FunctionError()
def main(inp, out):
    """Accumulate refanged IPv4s from *inp* into the shared common.IPs list
    (skipping duplicates) and write the full list to *out*."""
    for line in inp.readlines():
        for addr in iocextract.extract_ipv4s(line, refang=True):
            if addr in common.IPs:
                print(addr + ' Already in List')
            else:
                common.IPs.append(addr)
                print(addr + ', ')
    out.write('\n#####IPs#####\n\n')
    for addr in common.IPs:
        out.write('"' + addr + '", \n')
def ioc_parse(line):
    """ Use library that can handle defanged formats for IOCs (Indicators of Compromise) """
    params = []
    formatted = line
    # NOTE(review): get_ioc_param(...) appears to return [start, end, ...]
    # offsets of the match within `formatted` -- confirm against its
    # definition. After each match the (possibly stripped) value is spliced
    # back over its original span so later offsets stay consistent.
    for url in iocextract.extract_urls(formatted, strip=True):
        refanged = iocextract.refang_url(url)
        param = get_ioc_param('url', url, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], url,
                                    formatted[param[1]:])
    for ip in iocextract.extract_ipv4s(formatted):
        refanged = iocextract.refang_ipv4(ip)
        param = get_ioc_param('ip_address', ip, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], ip,
                                    formatted[param[1]:])
    for ip in iocextract.extract_ipv6s(formatted):
        # No refang helper used for IPv6; only offsets are recorded.
        param = get_ioc_param('ip_address', ip, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], ip,
                                    formatted[param[1]:])
    for email in iocextract.extract_emails(formatted):
        refanged = iocextract.refang_email(email)
        param = get_ioc_param('email', email, formatted)
        param.append(refanged)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], email,
                                    formatted[param[1]:])
    for h in iocextract.extract_hashes(formatted):
        param = get_ioc_param('hash', h, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], h,
                                    formatted[param[1]:])
    for rule in iocextract.extract_yara_rules(formatted):
        param = get_ioc_param('yara_rule', rule, formatted)
        params.append(param)
        formatted = '{}{}{}'.format(formatted[:param[0]], rule,
                                    formatted[param[1]:])
    return formatted, params
def test_defang_dot(self):
    """Each supported dot-defanging style refangs back to the plain IP."""
    content = "192.168.0.1"
    # Flattened from the grouped style lists; iteration order unchanged.
    styles = ["\.", "(.)", "(.", ".)", "[.]", "[.", ".]"]
    for defang_style in styles:
        defanged = content.replace(".", defang_style)
        extracted = list(iocextract.extract_ipv4s(defanged, refang=True))
        self.assertEqual(len(extracted), 1,
                         "failed defang on: " + defang_style)
        self.assertEqual(extracted[0], content)
def test_defang_unsupported_comma(self):
    """Comma-based defanging styles must never yield exactly one valid IP."""
    content = "192.168.0.1"
    # Flattened from the grouped style lists; iteration order unchanged.
    styles = (
        ["(,(", "(,)", "),(", "),)", "(,", ",(", "),", ",)"]
        + ["[,[", "[,]", "],[", "],]", "[,", ",[", "],", ",]"]
        + ["{,{", "{,}", "},{", "},}", "{,", ",{", "},", ",}"]
    )
    for defang_style in styles:
        defanged = content.replace(".", defang_style)
        extracted = list(iocextract.extract_ipv4s(defanged, refang=True))
        self.assertNotEqual(
            len(extracted), 1,
            "should fail on defanging style : " + defang_style)
def test_defang_unsupported_dot(self):
    """Unsupported dot-defanging styles must never yield exactly one IP."""
    content = "192.168.0.1"
    # Flattened from the grouped style lists; iteration order unchanged.
    styles = (
        ["(.(", ").(", ").)", ".("]
        + ["[.[", "].[", "].]", ".[", "]."]
        + ["{.{", "{.}", "}.{", "}.}", "{.", ".{", "}.", ".}"]
    )
    for defang_style in styles:
        defanged = content.replace(".", defang_style)
        extracted = list(iocextract.extract_ipv4s(defanged, refang=True))
        self.assertNotEqual(
            len(extracted), 1,
            "should fail on defanging style : " + defang_style)
def CapeReporter(values):
    """Query the Cape Sandbox for each input value, classified by IOC type.

    For each entry in *values* the first matching IOC type wins, in the
    same priority order as the original if/elif chain: url, ip, md5,
    sha1, sha256. Entries matching nothing are skipped.

    :param values: iterable of raw user-input strings
    :return: list of ``{'Cape Sandbox': data}`` dicts from allReport()
    """
    # (type tag, extractor) pairs; order encodes match priority.
    extractors = [
        ('url', iocextract.extract_urls),
        ('ip', iocextract.extract_ipv4s),
        ('md5', iocextract.extract_md5_hashes),
        ('sha1', iocextract.extract_sha1_hashes),
        ('sha256', iocextract.extract_sha256_hashes),
    ]
    cape_val = []
    for usrInput in values:
        for argType, extract in extractors:
            matches = list(extract(usrInput))
            if matches:
                for data in allReport(matches[0], argType):
                    cape_val.append({'Cape Sandbox': data})
                break  # first matching type wins, like the if/elif chain
    return cape_val
def artifacts(self, raw):
    """Build artifacts for every URL, IPv4, email address and hash found
    in the raw report text."""
    text = str(raw)
    built = []
    for u in iocextract.extract_urls(text):
        built.append(self.build_artifact('url', str(u)))
    for i in iocextract.extract_ipv4s(text):
        built.append(self.build_artifact('ip', str(i)))
    for e in iocextract.extract_emails(text):
        built.append(self.build_artifact('mail', str(e)))
    for h in iocextract.extract_hashes(text):
        built.append(self.build_artifact('hash', str(h)))
    return built
def extract_text_indicators(username, tweet_id, text):
    """Extract IPv4, hash and URL/domain indicators from a tweet's text,
    recursing into linked ghostbin/pastebin pastes."""
    found = []
    user_id = '@{0}'.format(username)
    tweet_url = 'https://twitter.com/{0}/status/{1}'.format(username, tweet_id)
    try:
        for ip in iocextract.extract_ipv4s(text, refang=True):
            if is_valid_ip(ip):
                found.append(TwitterIndicator(user_id, tweet_url, 'IPv4', ip))
        for hash in iocextract.extract_hashes(text):
            hash_type = get_hash_type(hash)
            if hash_type:
                found.append(
                    TwitterIndicator(user_id, tweet_url, hash_type, hash))
        for url in iocextract.extract_urls(text, refang=True):
            # Paste sites get their contents extracted too.
            if 'ghostbin.com' in url or 'pastebin.com' in url:
                paste_indicators = extract_paste_indicators(username, url)
                if paste_indicators:
                    found.extend(paste_indicators)
            url = apply_url_fixes(url)
            if is_valid_url(url):
                found.append(TwitterIndicator(user_id, tweet_url, 'URL', url))
            elif INCLUDE_DOMAINS and is_valid_domain(url):
                found.append(TwitterIndicator(user_id, tweet_url, 'HOST', url))
    except Exception as ex:
        LOGGER.warning('Exception parsing text: {0}'.format(ex))
    return found
def artifacts(self, raw):
    """Return a single file artifact when a filename is known; otherwise
    mine the raw report text for unique URL, IP and mail artifacts."""
    if self.filename:
        return [self.build_artifact("file", self.filename)]

    # Un-escape quotes before extraction, as in the original.
    text = str(raw).replace('\\"', '"')
    built = []
    for u in set(iocextract.extract_urls(text)):
        built.append(self.build_artifact("url", str(u)))
    for i in set(iocextract.extract_ipv4s(text)):
        built.append(self.build_artifact("ip", str(i)))
    for e in set(iocextract.extract_emails(text)):
        built.append(self.build_artifact("mail", str(e)))
    return built
def test_ipv4(self):
    """A plain dotted-quad address is extracted exactly once, unchanged."""
    content = "192.168.0.1"
    extracted = list(iocextract.extract_ipv4s(content))
    self.assertEqual(len(extracted), 1)
    self.assertEqual(extracted[0], content)
"""Extract IPv4 and URL IOCs from a PDF given with -f/--file."""
import fitz
import iocextract
from optparse import OptionParser

parser = OptionParser(usage='usage: python extractor [-f] file.pdf')
parser.add_option('-f', '--file', dest='filename', help='foo help')
(options, args) = parser.parse_args()
if not options.filename:  # if filename is not given
    parser.error('Filename not given')

doc = fitz.open(options.filename)

# Single pass over the document: the original loaded every page and
# re-extracted its text twice (once per IOC type). Two accumulators keep
# the original output order (all IPv4s first, then all URLs).
ipv4s = []
urls = []
for page in range(doc.pageCount):
    text = doc.loadPage(page).getText("text")
    ipv4s.extend(iocextract.extract_ipv4s(text))
    urls.extend(iocextract.extract_urls(text))

# dict.fromkeys dedupes while preserving first-seen order.
iocs = list(dict.fromkeys(ipv4s + urls))
for i in iocs:
    print(i)
# NOTE(review): this method's source was garbled by text extraction --
# all indentation was collapsed and several string literals were split
# across physical lines (e.g. the f-string broken between
# "comment=f'IOC extraction: " and "{title}'", and the multi-line
# "Name: {name}, ..." format string). The body is preserved byte-for-byte
# below; recover the properly formatted version from version control
# rather than reconstructing it from this copy.
#
# Intent, as far as the visible code shows: dispatch on the configured
# mode (extract_file / extract_url / extract_all / domain / ip / hash) to
# pull IOCs from a file or web page, de-duplicate them against
# self.database, save and upload new ones; when self.feed is set, also
# ingest MalwareBazaar and CIRCL MISP OSINT feeds.
def start(self): self.logging() # Extraction if self.extract_all is not None \ or self.ip is not None \ or self.hash is not None \ or self.domain is not None \ or self.extract_file is not None: self.logger.info( 'Checking the type of extraction will be performed.') if self.extract_file is not None: self.logger.info( f'Obtaining IOC from file: {self.extract_file}') if os.path.exists(self.extract_file): openfile = open(self.extract_file, 'r+') all_text = openfile.read() title = self.extract_file file_name = self.extract_file else: self.logger.error( 'The given directory or file was not found.') elif self.extract_url is not None: self.logger.info( f'Obtaining IOC from WebSite: {self.extract_url}') self.driver.get(self.extract_url) soup = BeautifulSoup(self.driver.page_source, "html.parser") title = soup.find('title').get_text() all_text = self.select_all_text(soup=soup) file_name = self.extract_url if self.extract_all: self.driver.get(self.baseurl) count = 0 for extract_iocs in iocextract.extract_iocs(all_text): if '/' not in extract_iocs \ and '[at]' not in extract_iocs: if len( self.database.compare_ioc( IOC=extract_iocs.replace('[.]', '.'))) == 0: self.database.save_ioc(IOC=extract_iocs.replace( '[.]', '.'), signature=title, tags="Extract from URL", font="Extract", type="IOCS", file_name=file_name) self.uploadIOC( comment=f'IOC extraction: {title}', IOC=extract_iocs.replace('[.]', '.'), count=count, name=extract_iocs.replace('[.]', '.')) count += 1 else: self.logger.debug( f'IOC already registered: {extract_iocs}') elif self.domain: self.driver.get(self.baseurl) count = 0 for extract_urls in iocextract.extract_urls(all_text): if '/' not in extract_urls \ and '[at]' not in extract_urls: if len( self.database.compare_ioc( IOC=extract_urls.replace('[.]', '.'))) == 0: self.database.save_ioc(IOC=extract_urls.replace( '[.]', '.'), signature=title, tags="Extract from URL", font="Extract", type="Domain", file_name=file_name) self.uploadIOC( comment=f'IOC extraction: 
{title}', IOC=extract_urls.replace('[.]', '.'), count=count, name=extract_urls.replace('[.]', '.')) count += 1 else: self.logger.debug( f'IOC already registered: {extract_urls}') elif self.ip: self.driver.get(self.baseurl) count = 0 for extract_ipv4s in iocextract.extract_ipv4s(all_text): if '/' not in extract_ipv4s \ and '[at]' not in extract_ipv4s: if len( self.database.compare_ioc( IOC=extract_ipv4s.replace('[.]', '.'))) == 0: self.database.save_ioc(IOC=extract_ipv4s.replace( '[.]', '.'), signature=title, tags="Extract from URL", font="Extract", type="ipv4", file_name=file_name) self.uploadIOC( comment=f'IOC extraction: {title}', IOC=extract_ipv4s.replace('[.]', '.'), count=count, name=extract_ipv4s.replace('[.]', '.')) count += 1 else: self.logger.debug( f'IOC already registered: {extract_ipv4s}') elif self.hash: self.logger.info('Getting only the Hashes from the site.') self.driver.get(self.baseurl) count = 0 for extract_hashes in iocextract.extract_hashes(all_text): if '/' not in extract_hashes \ and '[at]' not in extract_hashes: if len( self.database.compare_ioc( IOC=extract_hashes.replace('[.]', '.'))) == 0: self.database.save_ioc(IOC=extract_hashes.replace( '[.]', '.'), signature=title, tags="Extract from URL", font="Extract", type="Hash", file_name=file_name) self.uploadIOC( comment=f'IOC extraction: {title}', IOC=extract_hashes.replace('[.]', '.'), count=count, name=extract_hashes.replace('[.]', '.')) count += 1 else: self.logger.debug( f'IOC already registered: {extract_hashes}') if self.feed is not None: # MalwareBaazar count = 0 for iocs in MalwareBaazar().start: if len(self.database.compare_ioc( IOC=iocs['sha256_hash'])) == 0: comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format( name=iocs['file_name'], signature=iocs['signature'], tags=iocs['tags'], font='Bazaar') self.database.save_ioc(file_name=iocs['file_name'], IOC=iocs['sha256_hash'], signature=iocs['signature'], tags=str(iocs['tags']).replace("'",'') \ 
.replace('[','') \ .replace(']',''), font='Bazaar', type="Hash") self.uploadIOC(comment=comment, IOC=iocs['sha256_hash'], count=count, name=iocs['file_name']) count += 1 else: self.logger.debug( f"IOC already registered: {iocs['sha256_hash']}") # Circl for feed in MISPFeed( url="https://www.circl.lu/doc/misp/feed-osint/").start: request = requests.get(feed, headers={ 'User-Agent': 'Mozilla/5.0' }).json() count = 0 for iocs in request['Event']['Attribute']: if iocs['category'] == 'Payload delivery': if '.' not in iocs['value'] \ and len(iocs['value']) == 32 \ or len(iocs['value']) == 64: if len(self.database.compare_ioc( IOC=iocs['value'])) == 0: comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format( name=iocs['comment'].split(' ')[0], signature=iocs['category'], tags=iocs['category'], font="Circl") self.database.save_ioc( file_name=iocs['comment'].split(' ')[0], IOC=iocs['value'], signature=iocs['category'], tags=iocs['category'], font="Circl", type="Hash") self.uploadIOC( comment=comment, IOC=iocs['value'], count=count, name=iocs['comment'].split(' ')[0]) count += 1 else: self.logger.debug( f"IOC already registered: {iocs['value']}") elif iocs['category'] == 'External analysis': if 'virustotal' in iocs['value']: hash = iocs['value'].split('/')[4] if len(self.database.compare_ioc(IOC=hash)) == 0: comment = "Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format( name=iocs['comment'].split(' ')[0], signature=iocs['category'], tags=iocs['category'], font="Circl") self.database.save_ioc( file_name=iocs['comment'].split(' ')[0], IOC=hash, signature=iocs['category'], tags=iocs['category'], font="Circl", type="Hash") self.uploadIOC( comment=comment, IOC=iocs['value'], count=count, name=iocs['comment'].split(' ')[0]) count += 1 else: self.logger.debug( f"IOC already registered: {iocs['value']}") elif iocs['category'] == 'Artifacts dropped': hash = iocs['value'] if len(self.database.compare_ioc(IOC=hash)) == 0: comment = 
"Name: {name}, signature: {signature}, tags: {tags}, font: {font}".format( name=iocs['comment'].split(' ')[0], signature=iocs['category'], tags=iocs['category'], font="Circl") self.database.save_ioc( file_name=iocs['comment'].split(' ')[0], IOC=hash, signature=iocs['category'], tags=iocs['category'], font="Circl", type="Hash") self.uploadIOC(comment=comment, IOC=iocs['value'], count=count, name=iocs['comment'].split(' ')[0]) count += 1 else: self.logger.debug( f'IOC already registered: {hash}')
for filename in os.listdir(path): if(count > maximum - 1): break if(filename in skip_files): continue # Extract text from pdf filepath = os.path.join(path, filename) content = convert_pdf_txt(filepath) # Extract Indicators of Compromise from text, recording time extracted_files[filename] = {} extract_start_time = time.time() extracted_files[filename]["urls"] = list(iocextract.extract_urls(content, refang=True)) extracted_files[filename]["email_addresses"] = list(iocextract.extract_emails(content, refang=True)) extracted_files[filename]["ipv4s"] = list(iocextract.extract_ipv4s(content, refang=True)) extracted_files[filename]["ipv6s"] = list(iocextract.extract_ipv6s(content)) extracted_files[filename]["md5s"] = list(iocextract.extract_md5_hashes(content)) extracted_files[filename]["sha1s"] = list(iocextract.extract_sha1_hashes(content)) extracted_files[filename]["sha256s"] = list(iocextract.extract_sha256_hashes(content)) extracted_files[filename]["sha512s"] = list(iocextract.extract_sha512_hashes(content)) extracted_files[filename]["yara"] = list(iocextract.extract_yara_rules(content)) extract_avg_numerator += time.time() - extract_start_time count += 1 process_end_time = time.time() # add some meta info on process run time extracted_files["meta"] = { "tool": "iocextract",