def run(self):
    i = 0
    while (True):
        i = i + 1
        if i % 1000 == 0:
            time.sleep(10)
        url = self.r.rpop('crawl')
        fex = Faup()
        if url:
            print "url found: " + url
            fex.decode(url)
            domain = fex.get_host()
            entry = self.db.new_domaines.find_one({'domaine': domain})
            if entry == None:
                print "record: " + domain
                self.db.new_domaines.save({'domaine': domain, 'urls': [url]})
            else:
                urls_stored = entry['urls']
                if not url in urls_stored:
                    urls_stored.append(url)
                    entry['urls'] = urls_stored
                    self.db.new_domaines.save(entry)
def run(self):
    i = 0
    while (True):
        i = i + 1
        if i % 1000 == 0:
            time.sleep(10)
        self.lock.acquire()
        self.r.switchDB(1)
        url = self.r.rpop('crawl')
        self.lock.release()
        # print url
        fex = Faup()
        if url:
            print "url found: " + url
            try:
                fex.decode(url)
                domain = fex.get_host()
                entry = self.db.new_domaines.find_one({'domaine': domain})
                if entry == None:
                    print "record: " + domain
                    self.db.new_domaines.save({'domaine': domain, 'urls': [url]})
                else:
                    urls_stored = entry['urls']
                    if not url in urls_stored:
                        urls_stored.append(url)
                        entry['urls'] = urls_stored
                        self.db.new_domaines.save(entry)
            except:
                print "parsing fault " + url
def get_urls(url, depth=1):
    if depth > 5:
        print('Too many redirects.')
        return
    fex = Faup()

    def meta_redirect(content):
        c = content.lower()
        soup = BeautifulSoup(c, "html.parser")
        for result in soup.find_all(attrs={'http-equiv': 'refresh'}):
            if result:
                out = result["content"].split(";")
                if len(out) == 2:
                    wait, text = out
                    a, url = text.split('=', 1)
                    return url.strip()
        return None

    resolve, reason = try_resolve(fex, url)
    if not resolve:
        # FIXME: inform that the domain does not resolve
        yield url
        return

    logging.debug("Making HTTP connection to " + url)
    headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0'}
    try:
        response = requests.get(url, allow_redirects=True, headers=headers, timeout=15, verify=False)
    except:
        # That one can fail (DNS for example)
        # FIXME: inform that the get failed
        yield url
        return

    if response.history is not None:
        for h in response.history:
            # Yield the urls in the order we find them
            yield h.url
    yield response.url

    meta_redir_url = meta_redirect(response.content)
    if meta_redir_url is not None:
        depth += 1
        if not meta_redir_url.startswith('http'):
            fex.decode(url)
            base = '{}://{}'.format(fex.get_scheme(), fex.get_host())
            port = fex.get_port()
            if port is not None:
                base += ':{}'.format(port)
            if not meta_redir_url.startswith('/'):
                # relative redirect. resource_path has the initial '/'
                if fex.get_resource_path() is not None:
                    base += fex.get_resource_path()
            if not base.endswith('/'):
                base += '/'
            meta_redir_url = base + meta_redir_url
        for url in get_urls(meta_redir_url, depth):
            yield url
def whois(server, port, domain, ignorelist, replacelist):
    cached = _cache_get(domain, 'whois')
    if cached is not None:
        return cached
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(15)
    try:
        s.connect((server, port))
    except Exception:
        print("Connection problems - check WHOIS server")
        print(("WHOIS request while problem occurred: ", domain))
        print(("WHOIS server: {}:{}".format(server, port)))
        sys.exit(0)
    if domain.startswith('http'):
        fex = Faup()
        fex.decode(domain)
        d = fex.get_domain().lower()
    else:
        d = domain
    s.send(d + "\r\n")
    response = ''
    while True:
        d = s.recv(4096)
        response += d
        if d == '':
            break
    s.close()
    match = re.findall(r'[\w\.-]+@[\w\.-]+', response)
    emails = process_emails(match, ignorelist, replacelist)
    if len(emails) == 0:
        return None
    list_mail = list(set(emails))
    _cache_set(domain, list_mail, 'whois')
    return list_mail
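A minimal, hypothetical invocation of the helper above; the WHOIS server, port and empty filter lists are placeholder values, and the str-based socket handling follows the Python 2 style of the snippet:

# Hypothetical call; 'whois.iana.org' and port 43 are illustrative values only.
emails = whois('whois.iana.org', 43, 'example.com', ignorelist=[], replacelist={})
print(emails)  # None when no contact addresses are found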
def process(self):
    list_domains = self.db['new_domaines'].distinct('domaine')
    fex = Faup()
    for domain in list_domains:
        url = 'http://' + str(domain)
        fex.decode(url, False)
        print (fex.get_tld() + ',' + fex.get_domain() + ',' + ','.join(fex.get_subdomain().split('.')[::-1]).replace('www', '')).replace(',,', ',')
def dns_resolve(url):
    cached = _cache_get(url, 'dns')
    if cached is not None:
        return cached
    fex = Faup()
    fex.decode(url)
    host = fex.get_host().lower()
    ipv4 = None
    ipv6 = None
    if is_ip(host):
        if ':' in host:
            try:
                socket.inet_pton(socket.AF_INET6, host)
                ipv6 = [host]
            except:
                pass
        else:
            try:
                socket.inet_aton(host)
                ipv4 = [host]
            except:
                pass
    else:
        try:
            ipv4 = [str(ip) for ip in dns.resolver.query(host, 'A')]
        except:
            logging.debug("No IPv4 address assigned to: " + host)
        try:
            ipv6 = [str(ip) for ip in dns.resolver.query(host, 'AAAA')]
        except:
            logging.debug("No IPv6 address assigned to: " + host)
    _cache_set(url, (ipv4, ipv6), 'dns')
    return ipv4, ipv6
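A quick, hypothetical call of the resolver above (it assumes the `_cache_get`/`_cache_set` helpers, `dns.resolver` and `is_ip` from the surrounding module are available):

# Hypothetical call; results depend on live DNS at query time.
ipv4, ipv6 = dns_resolve('http://www.example.com/index.html')
print(ipv4, ipv6)  # two lists of addresses, or None when a record type is absent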
def getmisp_urls(key, url, timeframe):
    response_domains = []
    headers = {
        'Authorization': '{}'.format(key),
        'Content-type': 'application/json',
        'Accept': 'application/json'
    }
    payload = '{ "returnFormat": "json", "type": "url", "last": "%s", "enforceWarninglist": true }' % timeframe
    response = requests.post(url, headers=headers, data=payload, verify=False)
    json_response = json.loads(response.text)
    fp = Faup()
    try:
        for attr in json_response['response']['Attribute']:
            url = attr['value']
            eventid = attr['event_id']
            if eventid not in ignore_eventid:
                category = attr['category']
                timestamp = datetime.datetime.utcfromtimestamp(int(attr['timestamp'])).strftime('%Y-%m-%d')
                fp.decode(url)
                domain = fp.get_domain()
                if re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain):
                    response_domains.append({'domain': domain, 'eventid': eventid, 'category': category, 'timestamp': timestamp})
        return response_domains
    except:
        return response_domains
def __post_init__(self):
    if self.domain is None:
        f = Faup()  # Example code at https://programtalk.com/python-examples-amp/pyfaup.faup.Faup/
        f.decode(self.address.split("@")[-1])
        self.top_level_domain = f.get_tld()
        self.domain = f.get_domain()
        self.subdomain = f.get_subdomain()
class Urls(AbstractModule): """ Urls module for AIL framework """ def __init__(self): """ Init Urls """ super(Urls, self).__init__() self.faup = Faup() self.redis_cache_key = regex_helper.generate_redis_cache_key( self.module_name) # Protocol file path protocolsfile_path = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "protocolsfile")) # Get all uri from protocolsfile (Used for Curve) uri_scheme = "" with open(protocolsfile_path, 'r') as scheme_file: for scheme in scheme_file: uri_scheme += scheme[:-1] + "|" uri_scheme = uri_scheme[:-1] self.url_regex = "((?i:"+uri_scheme + \ ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:[a-zA-Z]{2,15}))(?:\:[0-9]+)*(?:/?(?:[a-zA-Z0-9\.\,\?'\\+&%\$#\=~_\-]+))*)" # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") def compute(self, message): """ Search for Web links from given message """ # Extract item id, score = message.split() item = Item(id) item_content = item.get_content() l_urls = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.url_regex, item.get_id(), item_content) for url in l_urls: self.faup.decode(url) unpack_url = self.faup.get() to_send = f"{url} {item.get_id()}" print(to_send) self.send_message_to_queue(to_send, 'Url') self.redis_logger.debug(f"url_parsed: {to_send}") if len(l_urls) > 0: to_print = f'Urls;{item.get_source()};{item.get_date()};{item.get_basename()};' self.redis_logger.info( f'{to_print}Detected {len(l_urls)} URL;{item.get_id()}')
def __post_init__(self):
    f = Faup()  # Example code at https://programtalk.com/python-examples-amp/pyfaup.faup.Faup/
    f.decode(self.url)
    self.scheme = f.get_scheme()
    self.top_level_domain = f.get_tld()
    self.domain = f.get_domain()
    self.subdomain = f.get_subdomain()
    self.path = f.get_resource_path()
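The two `__post_init__` hooks above read like `dataclasses` post-init methods; a minimal, hypothetical sketch of a dataclass the second hook could sit on is shown below (only `url` and the Faup calls come from the snippet, the class name and Optional field defaults are assumptions for illustration):

# Hypothetical wrapper dataclass; the Optional defaults are assumed so that
# Faup can fill the fields in after __init__.
from dataclasses import dataclass
from typing import Optional
from pyfaup.faup import Faup

@dataclass
class ParsedUrl:
    url: str
    scheme: Optional[str] = None
    top_level_domain: Optional[str] = None
    domain: Optional[str] = None
    subdomain: Optional[str] = None
    path: Optional[str] = None

    def __post_init__(self):
        # Decode the URL once and populate the derived fields.
        f = Faup()
        f.decode(self.url)
        self.scheme = f.get_scheme()
        self.top_level_domain = f.get_tld()
        self.domain = f.get_domain()
        self.subdomain = f.get_subdomain()
        self.path = f.get_resource_path()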
def harvesting_google(query, numberofpage):
    listreturn = []
    result = Popen(['casperjs', 'CeleryWeb/casperjs/googlesearch.js', str(query), str(numberofpage)], stdout=PIPE)
    urls = result.stdout.readlines()
    for url in urls:
        f = Faup()
        url = url.replace('\n', '')
        f.decode(url)
        listreturn.append(f.get())
    return listreturn
class SQLInjectionDetection(AbstractModule): """docstring for SQLInjectionDetection module.""" # # TODO: IMPROVE ME # Reference: https://github.com/stamparm/maltrail/blob/master/core/settings.py SQLI_REGEX = r"information_schema|sysdatabases|sysusers|floor\(rand\(|ORDER BY \d+|\bUNION\s+(ALL\s+)?SELECT\b|\b(UPDATEXML|EXTRACTVALUE)\(|\bCASE[^\w]+WHEN.*THEN\b|\bWAITFOR[^\w]+DELAY\b|\bCONVERT\(|VARCHAR\(|\bCOUNT\(\*\)|\b(pg_)?sleep\(|\bSELECT\b.*\bFROM\b.*\b(WHERE|GROUP|ORDER)\b|\bSELECT \w+ FROM \w+|\b(AND|OR|SELECT)\b.*/\*.*\*/|/\*.*\*/.*\b(AND|OR|SELECT)\b|\b(AND|OR)[^\w]+\d+['\") ]?[=><]['\"( ]?\d+|ODBC;DRIVER|\bINTO\s+(OUT|DUMP)FILE" def __init__(self): super(SQLInjectionDetection, self).__init__() self.faup = Faup() config_loader = ConfigLoader() self.server_statistics = config_loader.get_redis_conn("ARDB_Statistics") self.redis_logger.info(f"Module: {self.module_name} Launched") def compute(self, message): url, id = message.split() if self.is_sql_injection(url): self.faup.decode(url) url_parsed = self.faup.get() item = Item(id) item_id = item.get_id() print(f"Detected SQL in URL: {item_id}") print(urllib.request.unquote(url)) to_print = f'SQLInjection;{item.get_source()};{item.get_date()};{item.get_basename()};Detected SQL in URL;{item_id}' self.redis_logger.warning(to_print) # Send to duplicate self.send_message_to_queue(item_id, 'Duplicate') # Tag msg = f'infoleak:automatic-detection="sql-injection";{item_id}' self.send_message_to_queue(msg, 'Tags') # statistics tld = url_parsed['tld'] if tld is not None: ## TODO: # FIXME: remove me try: tld = tld.decode() except: pass date = datetime.now().strftime("%Y%m") self.server_statistics.hincrby(f'SQLInjection_by_tld:{date}', tld, 1) # Try to detect if the url passed might be an sql injection by appliying the regex # defined above on it. def is_sql_injection(self, url_parsed): line = urllib.request.unquote(url_parsed) return re.search(SQLInjectionDetection.SQLI_REGEX, line, re.I) is not None
def run(self):
    i = 0
    while True:
        i = i + 1
        if i % 1000 == 0:
            time.sleep(10)
        url = self.r.rpop("crawl")
        fex = Faup()
        if url:
            fex.decode(url)
            domain = fex.get_host()
            entry = self.db.new_domaines.find_one({"domaine": domain})
            if entry == None:
                print "record: " + domain
                self.db.new_domaines.save({"domaine": domain, "urls": [url]})
def is_valid_url(url):
    cached = _cache_get(url, 'valid')
    key = date.today().isoformat() + '_submissions'
    r_cache.zincrby(key, url)
    if cached is not None:
        return cached
    fex = Faup()
    if url.startswith('hxxp'):
        url = 'http' + url[4:]
    elif not url.startswith('http'):
        url = 'http://' + url
    logging.debug("Checking validity of URL: " + url)
    fex.decode(url)
    scheme = fex.get_scheme()
    host = fex.get_host()
    if scheme is None or host is None:
        reason = "Not a valid http/https URL/URI"
        return False, url, reason
    _cache_set(url, (True, url, None), 'valid')
    return True, url, None
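For illustration, the validator above refangs `hxxp` schemes and prepends `http://` when no scheme is present; a hypothetical call (assuming the `r_cache` and `_cache_*` helpers are configured) might look like this:

# The tuple shape (ok, normalised_url, reason) matches the return statements above.
ok, normalised, reason = is_valid_url('hxxp://example.com/login')
print(ok, normalised)  # expected: True http://example.com/login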
def dns_resolve(url):
    cached = _cache_get(url, 'dns')
    if cached is not None:
        return cached
    fex = Faup()
    fex.decode(url)
    host = fex.get_host().lower()
    ipv4 = None
    ipv6 = None
    if not is_ip(host):
        try:
            ipv4 = [str(ip) for ip in dns.resolver.query(host, 'A')]
        except:
            logging.debug("No IPv4 address assigned to: " + host)
        try:
            ipv6 = [str(ip) for ip in dns.resolver.query(host, 'AAAA')]
        except:
            logging.debug("No IPv6 address assigned to: " + host)
    _cache_set(url, (ipv4, ipv6), 'dns')
    return ipv4, ipv6
def sort(self, elem_links, url):
    fex = Faup()
    f = Filters()
    f.load()
    self.r.switchDB(1)
    extend = True
    domainfilter = True
    schemefilter = True
    try:
        for link in elem_links:
            new_url = link
            self.r.switchDB(2)
            if not self.r.get(new_url) and new_url:
                self.r.switchDB(1)
                if not self.r.get(new_url):
                    fex.decode(new_url)
                    domain = fex.get_host()
                    if f.isfilteredscheme(fex.get_scheme()):
                        self.r.switchDB(2)
                        self.r.put(new_url, new_url)
                        schemefilter = False
                    if f.isfiltereddomains(domain):
                        self.r.switchDB(2)
                        self.r.put(new_url, new_url)
                        domainfilter = False
                    if f.isfilteredextention(fex.get_resource_path()):
                        extend = False
                        self.r.switchDB(2)
                        self.r.put(new_url, new_url)
                    if extend and domainfilter and schemefilter:
                        self.r.switchDB(1)
                        self.r.rpush('crawl', new_url)
                        self.queue.append(new_url)
    except TypeError as e:
        print "TypeError"
while True:
    if message is not None:
        filename, score = message.split()
        if prec_filename is None or filename != prec_filename:
            domains_list = []
            PST = Paste.Paste(filename)
            client = ip2asn()
            for x in PST.get_regex(url_regex):
                matching_url = re.search(url_regex, PST.get_p_content())
                url = matching_url.group(0)
                to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                p.populate_set_out(to_send, 'Url')

                faup.decode(url)
                domain = faup.get_domain()
                subdomain = faup.get_subdomain()
                f1 = None
                domains_list.append(domain)
                publisher.debug('{} Published'.format(url))
                if f1 == "onion":
                    print domain
                hostl = unicode(avoidNone(subdomain) + avoidNone(domain))
                try:
                    socket.setdefaulttimeout(1)
                    ip = socket.gethostbyname(unicode(hostl))
# The following needs fixes for ExpandedPyMisp
for attribs in res_search['response']['Attribute']:
    uuid = attribs['uuid']
    if uuid is not None:
        print("URL is already present.")
        # add sighting
        # if MISP allows to sight on add, we should implement it here, too
        misp.sighting(uuid=uuid, source="URLabuse")
        sys.exit(0)

# This is obsolete
#event = misp.get(misp_id)
#existing_event = MISPEvent()
#existing_event.load(event)

redirect_count = 0
fex = Faup()
fex.decode(url)
hostname = fex.get_host().lower()
screenshot = hostname.decode() + '.png'

mispObject = MISPObject('phishing')
mispObject.add_attribute('hostname', value=hostname.decode())

for key in response['result']:
    u = list(key.keys())[0]
    if redirect_count == 0:
        comment = "initial URL"
        mispObject.add_attribute('url', value=u, comment=comment)
    else:
        comment = "redirect URL: {}"
        mispObject.add_attribute('url-redirect', value=u, comment=comment.format(redirect_count))
    redirect_count += 1
class WebStats(AbstractModule): """ WebStats module for AIL framework """ # Config Var THRESHOLD_TOTAL_SUM = 200 # Above this value, a keyword is eligible for a progression THRESHOLD_INCREASE = 1.0 # The percentage representing the keyword occurence since num_day_to_look MAX_SET_CARDINALITY = 10 # The cardinality of the progression set NUM_DAY_TO_LOOK = 5 # the detection of the progression start num_day_to_look in the past def __init__(self): super(WebStats, self).__init__() # Send module state to logs self.redis_logger.info("Module %s initialized" % (self.module_name)) # Sent to the logging a description of the module self.redis_logger.info("Makes statistics about valid URL") self.pending_seconds = 5 * 60 # REDIS # self.r_serv_trend = redis.StrictRedis( host=self.process.config.get("ARDB_Trending", "host"), port=self.process.config.get("ARDB_Trending", "port"), db=self.process.config.get("ARDB_Trending", "db"), decode_responses=True) # FILE CURVE SECTION # self.csv_path_proto = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "protocolstrending_csv")) self.protocolsfile_path = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "protocolsfile")) self.csv_path_tld = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "tldstrending_csv")) self.tldsfile_path = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "tldsfile")) self.csv_path_domain = os.path.join( os.environ['AIL_HOME'], self.process.config.get("Directories", "domainstrending_csv")) self.faup = Faup() self.generate_new_graph = False def computeNone(self): if self.generate_new_graph: self.generate_new_graph = False today = datetime.date.today() year = today.year month = today.month self.redis_logger.debug('Building protocol graph') lib_words.create_curve_with_word_file(self.r_serv_trend, self.csv_path_proto, self.protocolsfile_path, year, month) self.redis_logger.debug('Building tld graph') lib_words.create_curve_with_word_file(self.r_serv_trend, self.csv_path_tld, self.tldsfile_path, year, month) self.redis_logger.debug('Building domain graph') lib_words.create_curve_from_redis_set(self.r_serv_trend, self.csv_path_domain, "domain", year, month) self.redis_logger.debug('end building') def compute(self, message): self.generate_new_graph = True # Do something with the message from the queue url, date, path = message.split() self.faup.decode(url) url_parsed = self.faup.get() # Scheme analysis self.analyse('scheme', date, url_parsed) # Tld analysis self.analyse('tld', date, url_parsed) # Domain analysis self.analyse('domain', date, url_parsed) self.compute_progression('scheme', self.NUM_DAY_TO_LOOK, url_parsed) self.compute_progression('tld', self.NUM_DAY_TO_LOOK, url_parsed) self.compute_progression('domain', self.NUM_DAY_TO_LOOK, url_parsed) def analyse(self, field_name, date, url_parsed): field = url_parsed[field_name] if field is not None: try: # faup version field = field.decode() except: pass self.r_serv_trend.hincrby(field, date, 1) if field_name == "domain": #save domain in a set for the monthly plot domain_set_name = "domain_set_" + date[0:6] self.r_serv_trend.sadd(domain_set_name, field) self.redis_logger.debug("added in " + domain_set_name + ": " + field) def get_date_range(self, num_day): curr_date = datetime.date.today() date = Date( str(curr_date.year) + str(curr_date.month).zfill(2) + str(curr_date.day).zfill(2)) date_list = [] for i in range(0, num_day + 1): date_list.append(date.substract_day(i)) return date_list 
def compute_progression_word(self, num_day, keyword): """ Compute the progression for one keyword """ date_range = self.get_date_range(num_day) # check if this keyword is eligible for progression keyword_total_sum = 0 value_list = [] for date in date_range: # get value up to date_range curr_value = self.r_serv_trend.hget(keyword, date) value_list.append(int(curr_value if curr_value is not None else 0)) keyword_total_sum += int( curr_value) if curr_value is not None else 0 oldest_value = value_list[ -1] if value_list[-1] != 0 else 1 #Avoid zero division # The progression is based on the ratio: value[i] / value[i-1] keyword_increase = 0 value_list_reversed = value_list[:] value_list_reversed.reverse() for i in range(1, len(value_list_reversed)): divisor = value_list_reversed[ i - 1] if value_list_reversed[i - 1] != 0 else 1 keyword_increase += value_list_reversed[i] / divisor return (keyword_increase, keyword_total_sum) def compute_progression(self, field_name, num_day, url_parsed): """ recompute the set top_progression zset - Compute the current field progression - re-compute the current progression for each first 2*self.MAX_SET_CARDINALITY fields in the top_progression_zset """ redis_progression_name_set = "z_top_progression_" + field_name keyword = url_parsed[field_name] if keyword is not None: #compute the progression of the current word keyword_increase, keyword_total_sum = self.compute_progression_word( num_day, keyword) #re-compute the progression of 2*self.MAX_SET_CARDINALITY current_top = self.r_serv_trend.zrevrangebyscore( redis_progression_name_set, '+inf', '-inf', withscores=True, start=0, num=2 * self.MAX_SET_CARDINALITY) for word, value in current_top: word_inc, word_tot_sum = self.compute_progression_word( num_day, word) self.r_serv_trend.zrem(redis_progression_name_set, word) if (word_tot_sum > self.THRESHOLD_TOTAL_SUM) and ( word_inc > self.THRESHOLD_INCREASE): self.r_serv_trend.zadd(redis_progression_name_set, float(word_inc), word) # filter before adding if (keyword_total_sum > self.THRESHOLD_TOTAL_SUM) and ( keyword_increase > self.THRESHOLD_INCREASE): self.r_serv_trend.zadd(redis_progression_name_set, float(keyword_increase), keyword)
class UrlsExtractor(object):
    def __init__(self):
        self._url_regex = re.compile(
            r'((?:(?:ht|f)tp(?:s?)\:\/\/)'
            r'(?:[!#$&-;=?-\[\]_a-z~]|%[0-9a-f]{2})+)', re.I)
        self._faup = Faup()

    def extract(self, text):
        """This function extract all url http(s) and ftp(s) from text.

        Return a dict, with a key for every second-level domain and
        value a list of disassembled urls (output Faup tool).

        Example disassembled url https://drive.google.com/drive/my-drive:

        {
            'domain': 'google.com',
            'domain_without_tld': 'google',
            'fragment': None,
            'host': 'drive.google.com',
            'port': None,
            'query_string': None,
            'resource_path': '/drive/my-drive',
            'scheme': 'https',
            'subdomain': 'drive',
            'tld': 'com',
            'url': 'https://drive.google.com/drive/my-drive'
        }
        """
        if not isinstance(text, unicode):
            raise NotUnicodeError("The given text is not in unicode")

        self._results = dict()

        for i in self._url_regex.finditer(text):
            try:
                """
                import urlnorm
                url = urlnorm.norm(i.group(0).strip())

                Can't use urlnorm because can't manage domain like
                http://contentsr,xn--90afavbplfx2a6a5b2a,xn--p1ai/
                After norm it's impossible tokenize this kind of urls
                """
                url = i.group(0).strip()
            except:
                raise FailedRegexUrl("Failed parsing regex urls")

            try:
                self._faup.decode(url)
                tokens = self._faup.get()

                # Get results for domain
                domain = self._results.get(tokens['domain'], None)
                if domain:
                    domain.append(tokens)
                else:
                    self._results[tokens['domain']] = [tokens]
            except:
                raise FailedFaupParsing("Failed tokenize url with Faup")

    @property
    def urls_obj(self):
        return self._results

    @property
    def urls_json(self):
        try:
            return json.dumps(self.urls_obj, ensure_ascii=False)
        except:
            raise FailedReturnJsonUrls("Failed make JSON from urls result")
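A brief, hypothetical use of the extractor above (it targets Python 2, hence the `unicode` check; the sample text is arbitrary):

extractor = UrlsExtractor()
extractor.extract(u"Report hosted at https://drive.google.com/drive/my-drive")
print(extractor.urls_json)  # JSON keyed by second-level domain, e.g. "google.com"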
#Send to BrowseWarningPaste
p.populate_set_out('credential;{}'.format(filepath), 'BrowseWarningPaste')

#Put in form, count occurences, then send to moduleStats
creds_sites = {}
site_occurence = re.findall(regex_site_for_stats, content)
for site in site_occurence:
    site_domain = site[1:-1]
    if site_domain in creds_sites.keys():
        creds_sites[site_domain] += 1
    else:
        creds_sites[site_domain] = 1

for url in sites:
    faup.decode(url)
    domain = faup.get()['domain']
    if domain in creds_sites.keys():
        creds_sites[domain] += 1
    else:
        creds_sites[domain] = 1

for site, num in creds_sites.iteritems():  # Send for each different site to moduleStats
    print 'credential;{};{};{}'.format(num, site, paste.p_date)
    p.populate_set_out('credential;{};{};{}'.format(num, site, paste.p_date), 'ModuleStats')

if sites_set:
    print("=======> Probably on : {}".format(', '.join(sites_set)))
class Mail2MISP(): def __init__(self, misp_url, misp_key, verifycert, config, offline=False, urlsonly=False): self.offline = offline if not self.offline: self.misp = ExpandedPyMISP(misp_url, misp_key, verifycert, debug=config.debug) self.config = config self.urlsonly = urlsonly if not hasattr(self.config, 'enable_dns'): setattr(self.config, 'enable_dns', True) if self.urlsonly is False: setattr(self.config, 'enable_dns', False) self.debug = self.config.debug self.config_from_email_body = {} # Init Faup self.f = Faup() self.sightings_to_add = [] def load_email(self, pseudofile): self.pseudofile = pseudofile self.original_mail = message_from_bytes(self.pseudofile.getvalue(), policy=policy.default) self.subject = self.original_mail.get('Subject') try: self.sender = self.original_mail.get('From') except: self.sender = "<unknown sender>" # Remove words from subject for removeword in self.config.removelist: self.subject = re.sub(removeword, "", self.subject).strip() # Initialize the MISP event self.misp_event = MISPEvent() self.misp_event.info = f'{self.config.email_subject_prefix} - {self.subject}' self.misp_event.distribution = self.config.default_distribution self.misp_event.threat_level_id = self.config.default_threat_level self.misp_event.analysis = self.config.default_analysis def sighting(self, value, source): if self.offline: raise Exception('The script is running in offline mode, ') '''Add a sighting''' s = MISPSighting() s.from_dict(value=value, source=source) self.misp.add_sighting(s) def _find_inline_forward(self): '''Does the body contains a forwarded email?''' for identifier in self.config.forward_identifiers: if identifier in self.clean_email_body: self.clean_email_body, fw_email = self.clean_email_body.split( identifier) return self.forwarded_email( pseudofile=BytesIO(fw_email.encode())) def _find_attached_forward(self): forwarded_emails = [] for attachment in self.original_mail.iter_attachments(): attachment_content = attachment.get_content() # Search for email forwarded as attachment # I could have more than one, attaching everything. if isinstance(attachment_content, message.EmailMessage): forwarded_emails.append( self.forwarded_email( pseudofile=BytesIO(attachment_content.as_bytes()))) else: if isinstance(attachment_content, str): attachment_content = attachment_content.encode() filename = attachment.get_filename() if not filename: filename = 'missing_filename' if self.config_from_email_body.get( 'attachment' ) == self.config.m2m_benign_attachment_keyword: # Attach sane file self.misp_event.add_attribute( 'attachment', value=filename, data=BytesIO(attachment_content)) else: f_object, main_object, sections = make_binary_objects( pseudofile=BytesIO(attachment_content), filename=filename, standalone=False) self.misp_event.add_object(f_object) if main_object: self.misp_event.add_object(main_object) [ self.misp_event.add_object(section) for section in sections ] return forwarded_emails def email_from_spamtrap(self): '''The email comes from a spamtrap and should be attached as-is.''' raw_body = self.original_mail.get_body(preferencelist=('html', 'plain')) if raw_body: self.clean_email_body = html.unescape( raw_body.get_payload(decode=True).decode( 'utf8', 'surrogateescape')) else: self.clean_email_body = '' return self.forwarded_email(self.pseudofile) def forwarded_email(self, pseudofile: BytesIO): '''Extracts all possible indicators out of an email and create a MISP event out of it. 
* Gets all relevant Headers * Attach the body * Create MISP file objects (uses lief if possible) * Set all references ''' email_object = EMailObject(pseudofile=pseudofile, attach_original_mail=True, standalone=False) if email_object.attachments: # Create file objects for the attachments for attachment_name, attachment in email_object.attachments: if not attachment_name: attachment_name = 'NameMissing.txt' if self.config_from_email_body.get( 'attachment' ) == self.config.m2m_benign_attachment_keyword: a = self.misp_event.add_attribute('attachment', value=attachment_name, data=attachment) email_object.add_reference(a.uuid, 'related-to', 'Email attachment') else: f_object, main_object, sections = make_binary_objects( pseudofile=attachment, filename=attachment_name, standalone=False) if self.config.vt_key: try: vt_object = VTReportObject( self.config.vt_key, f_object.get_attributes_by_relation( 'sha256')[0].value, standalone=False) self.misp_event.add_object(vt_object) f_object.add_reference(vt_object.uuid, 'analysed-with') except InvalidMISPObject as e: print(e) pass self.misp_event.add_object(f_object) if main_object: self.misp_event.add_object(main_object) for section in sections: self.misp_event.add_object(section) email_object.add_reference(f_object.uuid, 'related-to', 'Email attachment') self.process_body_iocs(email_object) if self.config.spamtrap or self.config.attach_original_mail or self.config_from_email_body.get( 'attach_original_mail'): self.misp_event.add_object(email_object) return email_object def process_email_body(self): mail_as_bytes = self.original_mail.get_body( preferencelist=('html', 'plain')).get_payload(decode=True) if mail_as_bytes: self.clean_email_body = html.unescape( mail_as_bytes.decode('utf8', 'surrogateescape')) # Check if there are config lines in the body & convert them to a python dictionary: # <config.body_config_prefix>:<key>:<value> => {<key>: <value>} self.config_from_email_body = { k.strip(): v.strip() for k, v in re.findall( f'{self.config.body_config_prefix}:(.*):(.*)', self.clean_email_body) } if self.config_from_email_body: # ... remove the config lines from the body self.clean_email_body = re.sub( rf'^{self.config.body_config_prefix}.*\n?', '', html.unescape( self.original_mail.get_body( preferencelist=('html', 'plain')).get_payload( decode=True).decode('utf8', 'surrogateescape')), flags=re.MULTILINE) # Check if autopublish key is present and valid if self.config_from_email_body.get( 'm2mkey') == self.config.m2m_key: if self.config_from_email_body.get('distribution') is not None: self.misp_event.distribution = self.config_from_email_body.get( 'distribution') if self.config_from_email_body.get('threat_level') is not None: self.misp_event.threat_level_id = self.config_from_email_body.get( 'threat_level') if self.config_from_email_body.get('analysis') is not None: self.misp_event.analysis = self.config_from_email_body.get( 'analysis') if self.config_from_email_body.get('publish'): self.misp_event.publish() self._find_inline_forward() else: self.clean_email_body = '' self._find_attached_forward() def process_body_iocs(self, email_object=None): if email_object: body = html.unescape( email_object.email.get_body( preferencelist=('html', 'plain')).get_payload(decode=True).decode( 'utf8', 'surrogateescape')) else: body = self.clean_email_body # Cleanup body content # Depending on the source of the mail, there is some cleanup to do. 
Ignore lines in body of message for ignoreline in self.config.ignorelist: body = re.sub(rf'^{ignoreline}.*\n?', '', body, flags=re.MULTILINE) # Remove everything after the stopword from the body body = body.split(self.config.stopword, 1)[0] # Add tags to the event if keywords are found in the mail for tag in self.config.tlptags: for alternativetag in self.config.tlptags[tag]: if alternativetag in body.lower(): self.misp_event.add_tag(tag) # Prepare extraction of IOCs # Refang email data body = refang(body) # Extract and add hashes contains_hash = False for h in set(re.findall(hashmarker.MD5_REGEX, body)): contains_hash = True attribute = self.misp_event.add_attribute( 'md5', h, enforceWarninglist=self.config.enforcewarninglist) if email_object: email_object.add_reference(attribute.uuid, 'contains') if self.config.sighting: self.sightings_to_add.append((h, self.config.sighting_source)) for h in set(re.findall(hashmarker.SHA1_REGEX, body)): contains_hash = True attribute = self.misp_event.add_attribute( 'sha1', h, enforceWarninglist=self.config.enforcewarninglist) if email_object: email_object.add_reference(attribute.uuid, 'contains') if self.config.sighting: self.sightings_to_add.append((h, self.config.sighting_source)) for h in set(re.findall(hashmarker.SHA256_REGEX, body)): contains_hash = True attribute = self.misp_event.add_attribute( 'sha256', h, enforceWarninglist=self.config.enforcewarninglist) if email_object: email_object.add_reference(attribute.uuid, 'contains') if self.config.sighting: self.sightings_to_add.append((h, self.config.sighting_source)) if contains_hash: [ self.misp_event.add_tag(tag) for tag in self.config.hash_only_tags ] # # Extract network IOCs urllist = [] urllist += re.findall(urlmarker.WEB_URL_REGEX, body) urllist += re.findall(urlmarker.IP_REGEX, body) if self.debug: syslog.syslog(str(urllist)) hostname_processed = [] # Add IOCs and expanded information to MISP for entry in set(urllist): ids_flag = True self.f.decode(entry) domainname = self.f.get_domain() if domainname in self.config.excludelist: # Ignore the entry continue hostname = self.f.get_host() scheme = self.f.get_scheme() if scheme: scheme = scheme resource_path = self.f.get_resource_path() if resource_path: resource_path = resource_path if self.debug: syslog.syslog(domainname) if domainname in self.config.internallist and self.urlsonly is False: # Add link to internal reference unless in urlsonly mode attribute = self.misp_event.add_attribute( 'link', entry, category='Internal reference', to_ids=False, enforceWarninglist=False) if email_object: email_object.add_reference(attribute.uuid, 'contains') elif domainname in self.config.externallist or self.urlsonly is False: # External analysis attribute = self.misp_event.add_attribute( 'link', entry, category='External analysis', to_ids=False, enforceWarninglist=False) if email_object: email_object.add_reference(attribute.uuid, 'contains') elif domainname in self.config.externallist or self.urlsonly: # External analysis if self.urlsonly: comment = self.subject + " (from: " + self.sender + ")" else: comment = "" attribute = self.misp.add_attribute( self.urlsonly, { "type": 'link', "value": entry, "category": 'External analysis', "to_ids": False, "comment": comment }) for tag in self.config.tlptags: for alternativetag in self.config.tlptags[tag]: if alternativetag in self.subject.lower(): self.misp.tag(attribute["uuid"], tag) new_subject = comment.replace(alternativetag, '') self.misp.change_comment(attribute["uuid"], new_subject) else: # The URL is probably 
an indicator. comment = "" if (domainname in self.config.noidsflaglist) or ( hostname in self.config.noidsflaglist): ids_flag = False comment = "Known host (mostly for connectivity test or IP lookup)" if self.debug: syslog.syslog(str(entry)) if scheme: if is_ip(hostname): attribute = self.misp_event.add_attribute( 'url', entry, to_ids=False, enforceWarninglist=self.config.enforcewarninglist) if email_object: email_object.add_reference(attribute.uuid, 'contains') else: if resource_path: # URL has path, ignore warning list attribute = self.misp_event.add_attribute( 'url', entry, to_ids=ids_flag, enforceWarninglist=False, comment=comment) if email_object: email_object.add_reference( attribute.uuid, 'contains') else: # URL has no path attribute = self.misp_event.add_attribute( 'url', entry, to_ids=ids_flag, enforceWarninglist=self.config. enforcewarninglist, comment=comment) if email_object: email_object.add_reference( attribute.uuid, 'contains') if self.config.sighting: self.sightings_to_add.append( (entry, self.config.sighting_source)) if hostname in hostname_processed: # Hostname already processed. continue hostname_processed.append(hostname) if self.config.sighting: self.sightings_to_add.append( (hostname, self.config.sighting_source)) if self.debug: syslog.syslog(hostname) comment = '' port = self.f.get_port() if port: port = port comment = f'on port: {port}' if is_ip(hostname): attribute = self.misp_event.add_attribute( 'ip-dst', hostname, to_ids=ids_flag, enforceWarninglist=self.config.enforcewarninglist, comment=comment) if email_object: email_object.add_reference(attribute.uuid, 'contains') else: related_ips = [] if HAS_DNS and self.config.enable_dns: try: syslog.syslog(hostname) for rdata in dns.resolver.query(hostname, 'A'): if self.debug: syslog.syslog(str(rdata)) related_ips.append(rdata.to_text()) except Exception as e: if self.debug: syslog.syslog(str(e)) if related_ips: hip = MISPObject(name='ip-port') hip.add_attribute( 'hostname', value=hostname, to_ids=ids_flag, enforceWarninglist=self.config.enforcewarninglist, comment=comment) for ip in set(related_ips): hip.add_attribute('ip', type='ip-dst', value=ip, to_ids=False, enforceWarninglist=self.config. enforcewarninglist) self.misp_event.add_object(hip) if email_object: email_object.add_reference(hip.uuid, 'contains') else: if self.urlsonly is False: attribute = self.misp_event.add_attribute( 'hostname', value=hostname, to_ids=ids_flag, enforceWarninglist=self.config. enforcewarninglist, comment=comment) if email_object: email_object.add_reference(attribute.uuid, 'contains') def add_event(self): '''Add event on the remote MISP instance.''' # Add additional tags depending on others tags = [] for tag in [t.name for t in self.misp_event.tags]: if self.config.dependingtags.get(tag): tags += self.config.dependingtags.get(tag) # Add additional tags according to configuration for malware in self.config.malwaretags: if malware.lower() in self.subject.lower(): tags += self.config.malwaretags.get(malware) if tags: [self.misp_event.add_tag(tag) for tag in tags] has_tlp_tag = False for tag in [t.name for t in self.misp_event.tags]: if tag.lower().startswith('tlp'): has_tlp_tag = True if not has_tlp_tag: self.misp_event.add_tag(self.config.tlptag_default) if self.offline: return self.misp_event.to_json() event = self.misp.add_event(self.misp_event, pythonify=True) if self.config.sighting: for value, source in self.sightings_to_add: self.sighting(value, source) return event
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import pprint

from pyfaup.faup import Faup

f = Faup()
f.decode("www.météo.fr")
pprint.pprint(f.get())
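The accessors used throughout these snippets map onto the same fields documented in the `UrlsExtractor` docstring earlier (scheme, host, subdomain, domain, tld, port, resource_path, query_string, fragment); a small, hypothetical follow-up to the script above:

# Hypothetical follow-up reusing the same Faup instance; the expected values follow
# the drive.google.com example in the UrlsExtractor docstring above.
f.decode("https://drive.google.com/drive/my-drive")
print(f.get_scheme(), f.get_host(), f.get_domain(), f.get_tld())
# -> https drive.google.com google.com com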
#!/usr/bin/python

from pyfaup.faup import Faup
import sys
import codecs
import binascii

f = Faup()
file_urls = codecs.open(sys.argv[1], 'r', 'ascii', errors='ignore')
urls = file_urls.readlines()
for url in urls:
    url = url.replace('\n', '')
    #print("We decode the url: %s" % (url))
    #if sys.version.split('.')[0].split('.')[0]=='3':
    f.decode(bytes(url, 'utf-8'), False)
    #if sys.version.split('.')[0].split('.')[0]=='2':
    #    f.decode(bytes(url), False)
    #data = f.get()
    f.get_tld()
    #f.get_domain()
    #f.get_subdomain()
    #print(f.get_tld())
    #print(f.get_domain())
    #print("URL TLD: %s" % (f.get_tld()))
url_arg = sys.argv[1]
if urls_file is None:
    source_info = "arg:%s" % (sys.argv[1])
else:
    source_info = "file:%s" % (sys.argv[1])

urlw_log = UrlwLog(source_info)
urlw_log.open()
urlw_log.custom_log("Starting...")
urlw_p = UrlwPlugins(urlw_log)
fauplib = Faup()

if source_info.startswith("arg:"):
    fauplib.decode(sys.argv[1])
    faup_object = fauplib.get()
    for plugin in urlw_p.plugins_list:
        urlw_p.run(plugin, sys.argv[1], faup_object)
elif source_info.startswith("file:"):
    urls = urls_file.readlines()
    for url in urls:
        fauplib.decode(url)
        faup_object = fauplib.get()
        for plugin in urlw_p.plugins_list:
            urlw_p.run(plugin, url, faup_object)
    urls_file.close()

urlw_log.custom_log("Done")
def get_port(self):
    f = Faup()
    f.decode(self.url)
    return f.get_port()
class Query(): def __init__(self, loglevel: int = logging.DEBUG): self.__init_logger(loglevel) self.fex = Faup() self.cache = Redis(unix_socket_path=get_socket_path('cache'), db=1, decode_responses=True) def __init_logger(self, loglevel) -> None: self.logger = logging.getLogger(f'{self.__class__.__name__}') self.logger.setLevel(loglevel) def _cache_set(self, key, value, field=None): if field is None: self.cache.setex(key, json.dumps(value), 3600) else: self.cache.hset(key, field, json.dumps(value)) self.cache.expire(key, 3600) def _cache_get(self, key, field=None): if field is None: value_json = self.cache.get(key) else: value_json = self.cache.hget(key, field) if value_json is not None: return json.loads(value_json) return None def to_bool(self, s): """ Converts the given string to a boolean. """ return s.lower() in ('1', 'true', 'yes', 'on') def get_submissions(self, url, day=None): if day is None: day = date.today().isoformat() else: day = day.isoformat() return self.cache.zscore(f'{day}_submissions', url) def get_mail_sent(self, url, day=None): if day is None: day = date.today().isoformat() else: day = day.isoformat() self.fex.decode(url) host = self.fex.get_host() return self.cache.sismember(f'{day}_mails', host) def set_mail_sent(self, url, day=None): if day is None: day = date.today().isoformat() else: day = day.isoformat() self.fex.decode(url) host = self.fex.get_host() return self.cache.sadd(f'{day}_mails', host) def is_valid_url(self, url): cached = self._cache_get(url, 'valid') key = f'{date.today().isoformat()}_submissions' self.cache.zincrby(key, 1, url) if cached is not None: return cached if url.startswith('hxxp'): url = 'http' + url[4:] elif not url.startswith('http'): url = 'http://' + url logging.debug("Checking validity of URL: " + url) self.fex.decode(url) scheme = self.fex.get_scheme() host = self.fex.get_host() if scheme is None or host is None: reason = "Not a valid http/https URL/URI" return False, url, reason self._cache_set(url, (True, url, None), 'valid') return True, url, None def is_ip(self, host): try: ipaddress.ip_address(host) return True except ValueError: return False def try_resolve(self, url): self.fex.decode(url) host = self.fex.get_host().lower() if self.is_ip(host): return True, None try: ipaddr = dns.resolver.query(host, 'A') except Exception: reason = "DNS server problem. Check resolver settings." return False, reason if not ipaddr: reason = "Host " + host + " does not exist." 
return False, reason return True, None def get_urls(self, url, depth=1): if depth > 5: print('Too many redirects.') return def meta_redirect(content): c = content.lower() soup = BeautifulSoup(c, "html.parser") for result in soup.find_all(attrs={'http-equiv': 'refresh'}): if result: out = result["content"].split(";") if len(out) == 2: wait, text = out try: a, url = text.split('=', 1) return url.strip() except Exception: print(text) return None resolve, reason = self.try_resolve(url) if not resolve: # FIXME: inform that the domain does not resolve yield url return logging.debug(f"Making HTTP connection to {url}") headers = { 'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:8.0) Gecko/20100101 Firefox/8.0' } try: response = requests.get(url, allow_redirects=True, headers=headers, timeout=15, verify=False) except Exception: # That one can fail (DNS for example) # FIXME: inform that the get failed yield url return if response.history is not None: for h in response.history: # Yeld the urls in the order we find them yield h.url yield response.url meta_redir_url = meta_redirect(response.content) if meta_redir_url is not None: depth += 1 if not meta_redir_url.startswith('http'): self.fex.decode(url) base = '{}://{}'.format(self.fex.get_scheme(), self.fex.get_host()) port = self.fex.get_port() if port is not None: base += f':{port}' if not meta_redir_url.startswith('/'): # relative redirect. resource_path has the initial '/' if self.fex.get_resource_path() is not None: base += self.fex.get_resource_path() if not base.endswith('/'): base += '/' meta_redir_url = base + meta_redir_url for url in self.get_urls(meta_redir_url, depth): yield url def url_list(self, url): cached = self._cache_get(url, 'list') if cached is not None: return cached list_urls = [] for u in self.get_urls(url): if u is None or u in list_urls: continue list_urls.append(u) self._cache_set(url, list_urls, 'list') return list_urls def dns_resolve(self, url): cached = self._cache_get(url, 'dns') if cached is not None: return cached self.fex.decode(url) host = self.fex.get_host().lower() ipv4 = None ipv6 = None if self.is_ip(host): if ':' in host: try: socket.inet_pton(socket.AF_INET6, host) ipv6 = [host] except Exception: pass else: try: socket.inet_aton(host) ipv4 = [host] except Exception: pass else: try: ipv4 = [str(ip) for ip in dns.resolver.query(host, 'A')] except Exception: logging.debug("No IPv4 address assigned to: " + host) try: ipv6 = [str(ip) for ip in dns.resolver.query(host, 'AAAA')] except Exception: logging.debug("No IPv6 address assigned to: " + host) self._cache_set(url, (ipv4, ipv6), 'dns') return ipv4, ipv6 def phish_query(self, url, key, query): cached = self._cache_get(query, 'phishtank') if cached is not None: return cached postfields = {'url': quote(query), 'format': 'json', 'app_key': key} response = requests.post(url, data=postfields) res = response.json() if res["meta"]["status"] == "success": if res["results"]["in_database"]: self._cache_set(query, res["results"]["phish_detail_page"], 'phishtank') return res["results"]["phish_detail_page"] else: # no information pass elif res["meta"]["status"] == 'error': # Inform the user? 
# errormsg = res["errortext"] pass return None def sphinxsearch(server, port, url, query): # WARNING: too dangerous to have on the public interface return '' """ if not sphinx: return None cached = _cache_get(query, 'sphinx') if cached is not None: return cached client = sphinxapi.SphinxClient() client.SetServer(server, port) client.SetMatchMode(2) client.SetConnectTimeout(5.0) result = [] res = client.Query(query) if res.get("matches") is not None: for ticket in res["matches"]: ticket_id = ticket["id"] ticket_link = url + str(ticket_id) result.append(ticket_link) _cache_set(query, result, 'sphinx') return result """ def vt_query_url(self, url, url_up, key, query, upload=True): cached = self._cache_get(query, 'vt') if cached is not None and cached[2] is not None: return cached parameters = {"resource": query, "apikey": key} if upload: parameters['scan'] = 1 response = requests.post(url, data=parameters) if response.text is None or len(response.text) == 0: return None res = response.json() msg = res["verbose_msg"] link = res.get("permalink") positives = res.get("positives") total = res.get("total") self._cache_set(query, (msg, link, positives, total), 'vt') return msg, link, positives, total def gsb_query(self, url, query): cached = self._cache_get(query, 'gsb') if cached is not None: return cached param = '1\n' + query response = requests.post(url, data=param) status = response.status_code if status == 200: self._cache_set(query, response.text, 'gsb') return response.text ''' def urlquery_query(url, key, query): return None cached = _cache_get(query, 'urlquery') if cached is not None: return cached try: urlquery.url = url urlquery.key = key response = urlquery.search(query) except Exception: return None if response['_response_']['status'] == 'ok': if response.get('reports') is not None: total_alert_count = 0 for r in response['reports']: total_alert_count += r['urlquery_alert_count'] total_alert_count += r['ids_alert_count'] total_alert_count += r['blacklist_alert_count'] _cache_set(query, total_alert_count, 'urlquery') return total_alert_count else: return None ''' def process_emails(self, emails, ignorelist, replacelist): to_return = list(set(emails)) for mail in reversed(to_return): for ignorelist_entry in ignorelist: if re.search(ignorelist_entry, mail, re.I): if mail in to_return: to_return.remove(mail) for k, v in list(replacelist.items()): if re.search(k, mail, re.I): if k in to_return: to_return.remove(k) to_return += v return to_return def whois(self, server, port, domain, ignorelist, replacelist): cached = self._cache_get(domain, 'whois') if cached is not None: return cached s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) s.settimeout(15) try: s.connect((server, port)) except Exception: print("Connection problems - check WHOIS server") print(("WHOIS request while problem occurred: ", domain)) print(("WHOIS server: {}:{}".format(server, port))) return None if domain.startswith('http'): self.fex.decode(domain) d = self.fex.get_domain().lower() else: d = domain s.send(("{}\r\n".format(d)).encode()) response = b'' while True: d = s.recv(4096) response += d if d == b'': break s.close() match = re.findall(r'[\w\.-]+@[\w\.-]+', response.decode()) emails = self.process_emails(match, ignorelist, replacelist) if len(emails) == 0: return None list_mail = list(set(emails)) self._cache_set(domain, list_mail, 'whois') return list_mail def pdnscircl(self, url, user, passwd, q): cached = self._cache_get(q, 'pdns') if cached is not None: return cached pdns = PyPDNS(url, basic_auth=(user, 
passwd)) response = pdns.query(q) all_uniq = [] for e in reversed(response): host = e['rrname'].lower() if host in all_uniq: continue else: all_uniq.append(host) response = (len(all_uniq), all_uniq[:5]) self._cache_set(q, response, 'pdns') return response def psslcircl(self, url, user, passwd, q): cached = self._cache_get(q, 'pssl') if cached is not None: return cached pssl = PyPSSL(url, basic_auth=(user, passwd)) response = pssl.query(q) if response.get(q) is not None: certinfo = response.get(q) entries = {} for sha1 in certinfo['certificates']: entries[sha1] = [] if certinfo['subjects'].get(sha1): for value in certinfo['subjects'][sha1]['values']: entries[sha1].append(value) self._cache_set(q, entries, 'pssl') return entries return None def eupi(self, url, key, q): cached = self._cache_get(q, 'eupi') if cached is not None: return cached eu = PyEUPI(key, url) response = eu.search_url(url=q) if response.get('results'): r = response.get('results')[0]['tag_label'] self._cache_set(q, r, 'eupi') return r eu.post_submission(q) return None def bgpranking(self, ip): cached = self._cache_get(ip, 'ipasn') if cached is not None: asn = cached['asn'] prefix = cached['prefix'] else: ipasn = IPASNHistory() response = ipasn.query(ip) if 'response' not in response: asn = None prefix = None entry = response['response'][list(response['response'].keys())[0]] if entry: self._cache_set(ip, entry, 'ipasn') asn = entry['asn'] prefix = entry['prefix'] else: asn = None prefix = None if not asn or not prefix: # asn, prefix, asn_descr, rank, position, known_asns return None, None, None, None, None, None cached = self._cache_get(ip, 'bgpranking') if cached is not None: return cached bgpranking = BGPRanking() response = bgpranking.query(asn, date=(date.today() - timedelta(1)).isoformat()) if 'response' not in response or not response['response']: return None, None, None, None, None, None to_return = (asn, prefix, response['response']['asn_description'], response['response']['ranking']['rank'], response['response']['ranking']['position'], response['response']['ranking']['total_known_asns']) self._cache_set(ip, to_return, 'bgpranking') return to_return def lookyloo(self, url): cached = self._cache_get(url, 'lookyloo') if cached is not None: return cached lookyloo = Lookyloo() lookyloo_perma_url = lookyloo.enqueue(url) if lookyloo_perma_url: self._cache_set(url, lookyloo_perma_url, 'lookyloo') return lookyloo_perma_url return None def _deserialize_cached(self, entry): to_return = {} redirects = [] h = self.cache.hgetall(entry) for key, value in h.items(): v = json.loads(value) if key == 'list': redirects = v continue to_return[key] = v return to_return, redirects def get_url_data(self, url): data, redirects = self._deserialize_cached(url) if data.get('dns') is not None: ipv4, ipv6 = data['dns'] ip_data = {} if ipv4 is not None: for ip in ipv4: info, _ = self._deserialize_cached(ip) ip_data[ip] = info if ipv6 is not None: for ip in ipv6: info, _ = self._deserialize_cached(ip) ip_data[ip] = info if len(ip_data) > 0: data.update(ip_data) return {url: data}, redirects def cached(self, url, digest=False): url_data, redirects = self.get_url_data(url) to_return = [url_data] for u in redirects: if u == url: continue data, redir = self.get_url_data(u) to_return.append(data) if digest: return {'result': to_return, 'digest': self.digest(to_return)} return {'result': to_return} def ip_details_digest(self, ips, all_info, all_asns, all_mails): to_return = '' for ip in ips: to_return += '\t' + ip + '\n' data = all_info[ip] if 
data.get('bgpranking'): to_return += '\t\tis announced by {} ({}). Position {}/{}.\n'.format( data['bgpranking'][2], data['bgpranking'][0], data['bgpranking'][4], data['bgpranking'][5]) all_asns.add('{} ({})'.format(data['bgpranking'][2], data['bgpranking'][0])) if data.get('whois'): all_mails.update(data.get('whois')) return to_return def digest(self, data): to_return = '' all_mails = set() all_asns = set() for entry in data: # Each URL we're redirected to for url, info in entry.items(): # info contains the information we got for the URL. to_return += '\n{}\n'.format(url) if 'whois' in info: all_mails.update(info['whois']) if 'lookyloo' in info: to_return += '\tLookyloo permanent URL: {}\n'.format( info['lookyloo']) if 'vt' in info and len(info['vt']) == 4: if info['vt'][2] is not None: to_return += '\t{} out of {} positive detections in VT - {}\n'.format( info['vt'][2], info['vt'][3], info['vt'][1]) else: to_return += '\t{} - {}\n'.format( info['vt'][0], info['vt'][1]) if 'gsb' in info: to_return += '\tKnown as malicious on Google Safe Browsing: {}\n'.format( info['gsb']) if 'phishtank' in info: to_return += '\tKnown on PhishTank: {}\n'.format( info['phishtank']) if 'dns' in info: ipv4, ipv6 = info['dns'] if ipv4 is not None: to_return += self.ip_details_digest( ipv4, info, all_asns, all_mails) if ipv6 is not None: to_return += self.ip_details_digest( ipv6, info, all_asns, all_mails) return to_return, list(all_mails), list(all_asns)
#!/usr/bin/python

from pyfaup.faup import Faup
import sys
import codecs
import binascii

f = Faup()
file_urls = codecs.open(sys.argv[1], 'r', 'ascii', errors='ignore')
urls = file_urls.readlines()
for url in urls:
    url = url.replace('\n', '')
    print("URL:[%s]" % (url))
    f.decode(url)
    print("-----> Extracted TLD:%s" % f.get_tld())
if MX_values[0] > is_critical:
    publisher.warning(to_print)
    #Send to duplicate
    p.populate_set_out(filename, 'Duplicate')
    p.populate_set_out('mail;{}'.format(filename), 'alertHandler')

    msg = 'infoleak:automatic-detection="mail";{}'.format(filename)
    p.populate_set_out(msg, 'Tags')

    #create country statistics
    date = datetime.datetime.now().strftime("%Y%m")
    for mail in MX_values[1]:
        print('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date))
        p.populate_set_out('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date), 'ModuleStats')

        faup.decode(mail)
        tld = faup.get()['tld']
        server_statistics.hincrby('mail_by_tld:' + date, tld, MX_values[1][mail])
else:
    publisher.info(to_print)
    #create country statistics
    for mail in MX_values[1]:
        print('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date))
        p.populate_set_out('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date), 'ModuleStats')

prec_filename = filename

# The trailing branch below pairs with an `if message is not None:` check in the
# surrounding loop, which is not part of this excerpt.
else:
    publisher.debug("Script Mails is Idling 10s")
    print('Sleeping')
class Credential(AbstractModule): """ Credential module for AIL framework """ # Split username with spec. char or with upper case, distinguish start with upper REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+" REDIS_KEY_NUM_USERNAME = '******' REDIS_KEY_NUM_PATH = 'uniqNumForUsername' REDIS_KEY_ALL_CRED_SET = 'AllCredentials' REDIS_KEY_ALL_CRED_SET_REV = 'AllCredentialsRev' REDIS_KEY_ALL_PATH_SET = 'AllPath' REDIS_KEY_ALL_PATH_SET_REV = 'AllPathRev' REDIS_KEY_MAP_CRED_TO_PATH = 'CredToPathMapping' def __init__(self): super(Credential, self).__init__() self.faup = Faup() self.regex_web = "((?:https?:\/\/)[\.-_0-9a-zA-Z]+\.[0-9a-zA-Z]+)" self.regex_cred = "[a-zA-Z0-9\\._-]+@[a-zA-Z0-9\\.-]+\.[a-zA-Z]{2,6}[\\rn :\_\-]{1,10}[a-zA-Z0-9\_\-]+" self.regex_site_for_stats = "@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}:" self.redis_cache_key = regex_helper.generate_redis_cache_key(self.module_name) # Database config_loader = ConfigLoader.ConfigLoader() self.server_cred = config_loader.get_redis_conn("ARDB_TermCred") self.server_statistics = config_loader.get_redis_conn("ARDB_Statistics") # Config values self.minimumLengthThreshold = config_loader.get_config_int("Credential", "minimumLengthThreshold") self.criticalNumberToAlert = config_loader.get_config_int("Credential", "criticalNumberToAlert") self.max_execution_time = 30 # Waiting time in secondes between to message proccessed self.pending_seconds = 10 # Send module state to logs self.redis_logger.info(f"Module {self.module_name} initialized") def compute(self, message): id, count = message.split() item = Item(id) item_content = item.get_content() # Extract all credentials all_credentials = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_cred, item.get_id(), item_content, max_time=self.max_execution_time) if all_credentials: nb_cred = len(all_credentials) message = f'Checked {nb_cred} credentials found.' 
            all_sites = regex_helper.regex_findall(self.module_name, self.redis_cache_key, self.regex_web,
                                                   item.get_id(), item_content,
                                                   max_time=self.max_execution_time)
            if all_sites:
                discovered_sites = ', '.join(all_sites)
                message += f' Related websites: {discovered_sites}'

            print(message)

            to_print = f'Credential;{item.get_source()};{item.get_date()};{item.get_basename()};{message};{item.get_id()}'

            # Number of credentials is above the threshold: publish an alert
            if nb_cred > self.criticalNumberToAlert:
                print(f"========> Found more than 10 credentials in this file : {item.get_id()}")
                self.redis_logger.warning(to_print)

                # Send to duplicate
                self.send_message_to_queue(item.get_id(), 'Duplicate')

                msg = f'infoleak:automatic-detection="credential";{item.get_id()}'
                self.send_message_to_queue(msg, 'Tags')

                site_occurence = regex_helper.regex_findall(self.module_name, self.redis_cache_key,
                                                            self.regex_site_for_stats,
                                                            item.get_id(), item_content,
                                                            max_time=self.max_execution_time, r_set=False)

                creds_sites = {}

                for site in site_occurence:
                    site_domain = site[1:-1].lower()
                    if site_domain in creds_sites.keys():
                        creds_sites[site_domain] += 1
                    else:
                        creds_sites[site_domain] = 1

                for url in all_sites:
                    self.faup.decode(url)
                    domain = self.faup.get()['domain']
                    # TODO: FIXME: remove me, check faup version
                    try:
                        domain = domain.decode()
                    except:
                        pass
                    if domain in creds_sites.keys():
                        creds_sites[domain] += 1
                    else:
                        creds_sites[domain] = 1

                for site, num in creds_sites.items():
                    # Send each different site to moduleStats
                    mssg = f'credential;{num};{site};{item.get_date()}'
                    print(mssg)
                    self.send_message_to_queue(mssg, 'ModuleStats')

                if all_sites:
                    discovered_sites = ', '.join(all_sites)
                    print(f"=======> Probably on : {discovered_sites}")

                date = datetime.now().strftime("%Y%m")
                for cred in all_credentials:
                    maildomains = re.findall("@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,20}", cred.lower())[0]
                    self.faup.decode(maildomains)
                    tld = self.faup.get()['tld']
                    # TODO: FIXME: remove me
                    try:
                        tld = tld.decode()
                    except:
                        pass
                    self.server_statistics.hincrby('credential_by_tld:'+date, tld, 1)
            else:
                self.redis_logger.info(to_print)
                print(f'found {nb_cred} credentials')

            # For searching credentials in termFreq
            for cred in all_credentials:
                cred = cred.split('@')[0]  # Split to ignore the mail address part

                # unique number attached to a unique path
                uniq_num_path = self.server_cred.incr(Credential.REDIS_KEY_NUM_PATH)
                self.server_cred.hmset(Credential.REDIS_KEY_ALL_PATH_SET, {item.get_id(): uniq_num_path})
                self.server_cred.hmset(Credential.REDIS_KEY_ALL_PATH_SET_REV, {uniq_num_path: item.get_id()})

                # unique number attached to a unique username
                uniq_num_cred = self.server_cred.hget(Credential.REDIS_KEY_ALL_CRED_SET, cred)
                if uniq_num_cred is None:
                    # the credential does not exist yet, create new entries
                    uniq_num_cred = self.server_cred.incr(Credential.REDIS_KEY_NUM_USERNAME)
                    self.server_cred.hmset(Credential.REDIS_KEY_ALL_CRED_SET, {cred: uniq_num_cred})
                    self.server_cred.hmset(Credential.REDIS_KEY_ALL_CRED_SET_REV, {uniq_num_cred: cred})

                # Add the mapping between the credential and the path
                self.server_cred.sadd(Credential.REDIS_KEY_MAP_CRED_TO_PATH+'_'+str(uniq_num_cred), uniq_num_path)

                # Split credentials on capital letters, numbers, dots and so on
                # Add the splits to redis; each split points towards its initial credential unique number
                splitedCred = re.findall(Credential.REGEX_CRED, cred)
                for partCred in splitedCred:
                    if len(partCred) > self.minimumLengthThreshold:
                        self.server_cred.sadd(partCred, uniq_num_cred)
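# A small, self-contained sketch of how the REGEX_CRED pattern above breaks a username
# into searchable parts before indexing; the sample usernames are invented for illustration.
import re

REGEX_CRED = "[a-z]+|[A-Z]{3,}|[A-Z]{1,2}[a-z]+|[0-9]+"

for username in ("JohnDoe2024", "superADMINuser42"):
    parts = re.findall(REGEX_CRED, username)
    # Only parts longer than minimumLengthThreshold would be added to Redis.
    print(username, '->', parts)
# JohnDoe2024      -> ['John', 'Doe', '2024']
# superADMINuser42 -> ['super', 'ADMIN', 'user', '42']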
    ## TODO: add MAIL trackers

    valid_mx = check_mx_record(set_mxdomains, dns_server)

    item_date = Item.get_item_date(item_id)

    num_valid_email = 0
    for domain_mx in valid_mx:
        num_valid_email += len(dict_mxdomains_email[domain_mx])

        for email in dict_mxdomains_email[domain_mx]:
            msg = 'mail;{};{};{}'.format(1, email, item_date)
            p.populate_set_out(msg, 'ModuleStats')

            # Create country stats
            faup.decode(email)
            tld = faup.get()['tld']
            try:
                tld = tld.decode()
            except:
                pass
            server_statistics.hincrby('mail_by_tld:{}'.format(item_date), tld, 1)

    msg = 'Mails;{};{};{};Checked {} e-mail(s);{}'.format(Item.get_source(item_id), item_date,
                                                          Item.get_item_basename(item_id),
                                                          num_valid_email, item_id)
    if num_valid_email > mail_threshold:
        print('{} Checked {} e-mail(s)'.format(item_id, num_valid_email))
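# A minimal sketch of the per-TLD counter pattern used above, assuming a reachable local
# Redis and the redis-py client. One hash per date string (monthly in some modules, the
# item date here), one field per TLD; key and field names follow the convention above.
import datetime
import redis

r = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

def count_mail_tld(tld, amount=1):
    date = datetime.datetime.now().strftime("%Y%m")
    # HINCRBY creates the hash and the field on first use, so no initialisation is needed.
    r.hincrby('mail_by_tld:{}'.format(date), tld, amount)

count_mail_tld('com')
print(r.hgetall('mail_by_tld:{}'.format(datetime.datetime.now().strftime("%Y%m"))))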
#!/usr/bin/python
from pyfaup.faup import Faup
import sys
import codecs
import binascii

f = Faup()
file_urls = codecs.open(sys.argv[1], 'r', 'ascii', errors='ignore')
urls = file_urls.readlines()
for url in urls:
    url = url.replace('\n', '')
    print("URL:[%s]" % (url))
    f.decode(url, False)
    print("-----> Extracted TLD:%s" % f.get_tld())
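# A hedged sketch of the pattern the AIL modules above rely on when reading parsed fields:
# depending on the installed Faup version, get() may return bytes or str values, hence the
# defensive decode. The URL is illustrative; 'tld' and 'domain' are keys used by the modules above.
from pyfaup.faup import Faup

f = Faup()
f.decode("http://www.example.com/index.html")
parsed = f.get()          # dict with keys such as 'tld', 'domain', 'subdomain', ...
tld = parsed['tld']
try:                      # some Faup builds return bytes here
    tld = tld.decode()
except AttributeError:
    pass
print(tld)                # expected: com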
from pyfaup.faup import Faup
import sys
import re
import codecs
import binascii

# dynamically list all of Faup's getter methods
methods = []
for m in dir(Faup):
    if re.search("^get_", m):
        methods.append(m)
methods.remove("get_version")

# run
if len(sys.argv) != 2:
    print "%s <file containing 1 url per line>" % sys.argv[0]
    sys.exit(0)

f = Faup()
file_urls = codecs.open(sys.argv[1], 'r', 'ascii', errors='ignore')
urls = file_urls.readlines()
for url in urls:
    url = url.replace('\n', '')
    print("URL:[%s]" % (url))
    f.decode(url)
    # print("-----> Extracted TLD:%s" % f.get_tld())
    # print("-----> Extracted TLD:%s" % f.get_domain_without_tld())
    for m in methods:
        fct = getattr(f, m)
        print "\t%s : %s" % (re.sub("^get_", "", m), fct())
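# The same introspection trick in isolation: list every zero-argument "get_" accessor of an
# object and call it. It works for any class; DemoURL is a stand-in for illustration and is
# not part of pyfaup.
import re

class DemoURL(object):
    def __init__(self, host, tld):
        self._host, self._tld = host, tld
    def get_host(self):
        return self._host
    def get_tld(self):
        return self._tld

obj = DemoURL("www.example.com", "com")
for name in dir(obj):
    if re.match(r"^get_", name):
        # strip the "get_" prefix for display, then call the accessor
        print("%s : %s" % (re.sub(r"^get_", "", name), getattr(obj, name)()))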
            msg = 'infoleak:automatic-detection="mail";{}'.format(filename)
            p.populate_set_out(msg, 'Tags')

            # create country statistics
            date = datetime.datetime.now().strftime("%Y%m")
            for mail in MX_values[1]:
                print('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date))
                p.populate_set_out('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date), 'ModuleStats')

                faup.decode(mail)
                tld = faup.get()['tld']
                server_statistics.hincrby('mail_by_tld:' + date, tld, MX_values[1][mail])
        else:
            publisher.info(to_print)
            # create country statistics
            for mail in MX_values[1]:
                print('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date))
                p.populate_set_out('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date), 'ModuleStats')

        prec_filename = filename
class Web(AbstractModule):
    """
    Web module for AIL framework
    """

    # Used to prevent concatenation with empty fields due to URL parsing
    def avoidNone(self, a_string):
        if a_string is None:
            return ""
        else:
            return a_string

    def __init__(self):
        """
        Init Web
        """
        super(Web, self).__init__()

        # REDIS Cache
        self.r_serv2 = redis.StrictRedis(
            host=self.process.config.get("Redis_Cache", "host"),
            port=self.process.config.getint("Redis_Cache", "port"),
            db=self.process.config.getint("Redis_Cache", "db"),
            decode_responses=True)

        # Country to log as critical
        self.cc_critical = self.process.config.get("Url", "cc_critical")

        # FUNCTIONS #
        self.faup = Faup()

        # Protocol file path
        protocolsfile_path = os.path.join(os.environ['AIL_HOME'],
                                          self.process.config.get("Directories", "protocolsfile"))

        # Get all URI schemes from the protocols file (used for Curve)
        uri_scheme = ""
        with open(protocolsfile_path, 'r') as scheme_file:
            for scheme in scheme_file:
                uri_scheme += scheme[:-1] + "|"
        uri_scheme = uri_scheme[:-1]

        self.url_regex = "((?i:" + uri_scheme + \
            ")\://(?:[a-zA-Z0-9\.\-]+(?:\:[a-zA-Z0-9\.&%\$\-]+)*@)*(?:(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(?:25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|(?:[a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(?:com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(?:\:[0-9]+)*(?:/(?:$|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*)"

        self.prec_filename = None

        # Send module state to logs
        self.redis_logger.info("Module %s initialized" % (self.module_name))

    def compute(self, message):
        """
        Search for Web links in the given message
        """
        # Extract item
        filename, score = message.split()

        if self.prec_filename is None or filename != self.prec_filename:
            domains_list = set()
            PST = Paste.Paste(filename)
            client = ip2asn()

            detected_urls = PST.get_regex(self.url_regex)
            if len(detected_urls) > 0:
                to_print = 'Web;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name)
                self.redis_logger.info('{}Detected {} URL;{}'.format(to_print, len(detected_urls), PST.p_rel_path))

            for url in detected_urls:
                self.redis_logger.debug("match regex: %s" % (url))
                # self.redis_logger.debug("match regex search: %s" % (url))

                to_send = "{} {} {}".format(url, PST._get_p_date(), filename)
                self.process.populate_set_out(to_send, 'Url')
                self.redis_logger.debug("url_parsed: %s" % (to_send))

                self.faup.decode(url)
                domain = self.faup.get_domain()
                subdomain = self.faup.get_subdomain()

                self.redis_logger.debug('{} Published'.format(url))

                if subdomain is not None:
                    # TODO: FIXME: remove me
                    try:
                        subdomain = subdomain.decode()
                    except:
                        pass

                if domain is not None:
                    # TODO: FIXME: remove me
                    try:
                        domain = domain.decode()
                    except:
                        pass
                    domains_list.add(domain)

                hostl = self.avoidNone(subdomain) + self.avoidNone(domain)

                try:
                    socket.setdefaulttimeout(1)
                    ip = socket.gethostbyname(hostl)
                    # If the resolver does not return any IPv4 address,
                    # the ASN/CC lookup is skipped.
                    l = client.lookup(ip, qType='IP')
                except ipaddress.AddressValueError:
                    self.redis_logger.debug(f'ASN/CC lookup failed for IP {ip}')
                    continue
                except:
                    self.redis_logger.debug(f'Resolver IPv4 address failed for host {hostl}')
                    continue

                cc = getattr(l, 'cc')
                asn = ''
                if getattr(l, 'asn') is not None:
                    asn = getattr(l, 'asn')[2:]  # remove the b' prefix

                # EU is not an official ISO 3166 code (but used by RIPE
                # IP allocation)
                if cc is not None and cc != "EU":
                    self.redis_logger.debug('{};{};{};{}'.format(hostl, asn, cc,
                                                                 pycountry.countries.get(alpha_2=cc).name))
                    if cc == self.cc_critical:
                        to_print = 'Url;{};{};{};Detected {} {}'.format(PST.p_source, PST.p_date,
                                                                        PST.p_name, hostl, cc)
                        self.redis_logger.info(to_print)
                else:
                    self.redis_logger.debug('{};{};{}'.format(hostl, asn, cc))

            A_values = lib_refine.checking_A_record(self.r_serv2, domains_list)

            if A_values[0] >= 1:
                pprint.pprint(A_values)
                # self.redis_logger.info('Url;{};{};{};Checked {} URL;{}'.format(
                #     PST.p_source, PST.p_date, PST.p_name, A_values[0], PST.p_rel_path))

        self.prec_filename = filename
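# A short, standalone sketch of the resolution step above: set a global socket timeout and
# treat any resolution failure as "skip this host". The hostname is illustrative only.
import socket

def resolve_ipv4(host, timeout=1):
    socket.setdefaulttimeout(timeout)
    try:
        return socket.gethostbyname(host)
    except (socket.gaierror, socket.timeout):
        return None   # no IPv4 answer: the ASN/CC lookup would be skipped

print(resolve_ipv4("www.example.com"))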
class LibInjection(AbstractModule):
    """docstring for LibInjection module."""

    def __init__(self):
        super(LibInjection, self).__init__()

        self.faup = Faup()

        config_loader = ConfigLoader()
        self.server_statistics = config_loader.get_redis_conn("ARDB_Statistics")

        self.redis_logger.info(f"Module: {self.module_name} Launched")

    def compute(self, message):
        url, id = message.split()

        self.faup.decode(url)
        url_parsed = self.faup.get()
        # TODO: FIXME: remove me
        try:
            resource_path = url_parsed['resource_path'].encode()
        except:
            resource_path = url_parsed['resource_path']
        # TODO: FIXME: remove me
        try:
            query_string = url_parsed['query_string'].encode()
        except:
            query_string = url_parsed['query_string']

        result_path = {'sqli': False}
        result_query = {'sqli': False}

        if resource_path is not None:
            result_path = pylibinjection.detect_sqli(resource_path)
            # print(f'path is sqli : {result_path}')

        if query_string is not None:
            result_query = pylibinjection.detect_sqli(query_string)
            # print(f'query is sqli : {result_query}')

        if result_path['sqli'] is True or result_query['sqli'] is True:
            item = Item(id)
            item_id = item.get_id()
            print(f"Detected (libinjection) SQL in URL: {item_id}")
            print(urllib.request.unquote(url))

            to_print = f'LibInjection;{item.get_source()};{item.get_date()};{item.get_basename()};Detected SQL in URL;{item_id}'
            self.redis_logger.warning(to_print)

            # Send to duplicate
            self.send_message_to_queue(item_id, 'Duplicate')

            # Add tag
            msg = f'infoleak:automatic-detection="sql-injection";{item_id}'
            self.send_message_to_queue(msg, 'Tags')

            # statistics
            # TODO: FIXME: remove me
            try:
                tld = url_parsed['tld'].decode()
            except:
                tld = url_parsed['tld']
            if tld is not None:
                date = datetime.now().strftime("%Y%m")
                self.server_statistics.hincrby(f'SQLInjection_by_tld:{date}', tld, 1)
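# A minimal sketch of the libinjection check in isolation, assuming the same pylibinjection
# binding used above: detect_sqli() takes bytes and returns a dict carrying an 'sqli' flag.
# The query string is an invented, classic injection pattern used purely as a test value.
import pylibinjection

query_string = b"id=1' OR '1'='1"
result = pylibinjection.detect_sqli(query_string)
print(result['sqli'])   # expected to be truthy for this pattern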
        msg = 'infoleak:automatic-detection="credential";{}'.format(filepath)
        p.populate_set_out(msg, 'Tags')

        # Put in form, count occurrences, then send to moduleStats
        creds_sites = {}
        site_occurence = re.findall(regex_site_for_stats, content)
        for site in site_occurence:
            site_domain = site[1:-1]
            if site_domain in creds_sites.keys():
                creds_sites[site_domain] += 1
            else:
                creds_sites[site_domain] = 1

        for url in sites:
            faup.decode(url)
            domain = faup.get()['domain']
            # TODO: FIXME: remove me
            try:
                domain = domain.decode()
            except:
                pass
            if domain in creds_sites.keys():
                creds_sites[domain] += 1
            else:
                creds_sites[domain] = 1

        for site, num in creds_sites.items():
            # Send each different site to moduleStats
            mssg = 'credential;{};{};{}'.format(num, site, paste.p_date)
            print(mssg)
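# The manual "if key in dict" tally above can also be written with collections.Counter;
# this is an equivalent alternative, not what the module itself uses, and the sample data
# is invented for illustration.
from collections import Counter

site_occurence = ['@example.com:', '@mail.example.org:', '@example.com:']
creds_sites = Counter(site[1:-1].lower() for site in site_occurence)
creds_sites.update(['example.com'])          # e.g. a domain extracted from a URL via Faup
print(creds_sites.most_common())             # [('example.com', 3), ('mail.example.org', 1)]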
    for h in hashlist_md5:
        misp.add_hashes(new_event, md5=h)
    for h in hashlist_sha1:
        misp.add_hashes(new_event, sha1=h)
    for h in hashlist_sha256:
        misp.add_hashes(new_event, sha256=h)

    if (len(hashlist_md5) > 0) or (len(hashlist_sha1) > 0) or (len(hashlist_sha256) > 0):
        for tag in hash_only_tags:
            misp.add_tag(new_event, tag)

    # Add IOCs and expanded information to MISP
    for entry in urllist:
        ids_flag = True
        f.decode(entry)
        domainname = f.get_domain().decode('utf-8', 'ignore')
        hostname = f.get_host().decode('utf-8', 'ignore')
        try:
            schema = f.get_scheme().decode('utf-8', 'ignore')
        except:
            schema = False
        if debug:
            syslog.syslog(domainname)
        if domainname not in excludelist:
            if domainname in internallist:
                misp.add_named_attribute(new_event, 'link', entry,
                                         category='Internal reference',
                                         to_ids=False,
                                             item_id, item_content,
                                             max_time=max_execution_time, r_set=False)

    creds_sites = {}

    for site in site_occurence:
        site_domain = site[1:-1].lower()
        if site_domain in creds_sites.keys():
            creds_sites[site_domain] += 1
        else:
            creds_sites[site_domain] = 1

    for url in all_sites:
        faup.decode(url)
        domain = faup.get()['domain']
        # TODO: FIXME: remove me
        try:
            domain = domain.decode()
        except:
            pass
        if domain in creds_sites.keys():
            creds_sites[domain] += 1
        else:
            creds_sites[domain] = 1

    for site, num in creds_sites.items():
        # Send each different site to moduleStats
        mssg = 'credential;{};{};{}'.format(
    domain_url = 'http://{}'.format(domain)

    print()
    print()
    print('\033[92m------------------START CRAWLER------------------\033[0m')
    print('crawler type: {}'.format(type_hidden_service))
    print('\033[92m-------------------------------------------------\033[0m')
    print('url: {}'.format(url))
    print('domain: {}'.format(domain))
    print('domain_url: {}'.format(domain_url))

    faup.decode(domain)
    onion_domain = faup.get()['domain'].decode()

    if not r_onion.sismember('blacklist_{}'.format(type_hidden_service), domain) and \
            not r_onion.sismember('blacklist_{}'.format(type_hidden_service), onion_domain):

        date = datetime.datetime.now().strftime("%Y%m%d")
        date_month = datetime.datetime.now().strftime("%Y%m")

        if not r_onion.sismember('month_{}_up:{}'.format(type_hidden_service, date_month), domain) and \
                not r_onion.sismember(
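# A reduced sketch of the blacklist gate above, assuming a reachable local Redis and the
# redis-py client: a domain is crawled only if neither the raw domain nor its Faup-parsed
# registered domain is in the per-service blacklist set. The set name follows the code above.
import redis

r_onion = redis.Redis(host='localhost', port=6379, db=0, decode_responses=True)

def is_blacklisted(service_type, domain, onion_domain):
    key = 'blacklist_{}'.format(service_type)
    return r_onion.sismember(key, domain) or r_onion.sismember(key, onion_domain)

print(is_blacklisted('onion', 'example.onion', 'example.onion'))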