""" import time from packages import Paste from pubsublogger import publisher from Helper import Process if __name__ == "__main__": publisher.port = 6380 publisher.channel = "Script" config_section = 'Tokenize' p = Process(config_section) # LOGGING # publisher.info("Tokeniser started") while True: message = p.get_from_set() print message if message is not None: paste = Paste.Paste(message) for word, score in paste._get_top_words().items(): if len(word) >= 4: msg = '{} {} {}'.format(paste.p_path, word, score) p.populate_set_out(msg) else: publisher.debug("Tokeniser is idling 10s") time.sleep(10) print "sleepin"
time.sleep(1) continue # Creating the full filepath filename = os.path.join(os.environ['AIL_HOME'], p.config.get("Directories", "pastes"), paste) dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) decoded = base64.standard_b64decode(gzip64encoded) with open(filename, 'wb') as f: f.write(decoded) '''try: decoded2 = gunzip_bytes_obj(decoded) except: decoded2 ='' type = magic.from_buffer(decoded2, mime=True) if type!= 'text/x-c++' and type!= 'text/html' and type!= 'text/x-c' and type!= 'text/x-python' and type!= 'text/x-php' and type!= 'application/xml' and type!= 'text/x-shellscript' and type!= 'text/plain' and type!= 'text/x-diff' and type!= 'text/x-ruby': print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') print(filename) print(type) print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') ''' p.populate_set_out(filename) processed_paste += 1
message += ' Related websites: {}'.format((', '.join(sites_set))) to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path) print('\n '.join(creds)) #num of creds above tresh, publish an alert if len(creds) > criticalNumberToAlert: print("========> Found more than 10 credentials in this file : {}". format(filepath)) publisher.warning(to_print) #Send to duplicate p.populate_set_out(filepath, 'Duplicate') msg = 'infoleak:automatic-detection="credential";{}'.format( filepath) p.populate_set_out(msg, 'Tags') #Put in form, count occurences, then send to moduleStats creds_sites = {} site_occurence = re.findall(regex_site_for_stats, content) for site in site_occurence: site_domain = site[1:-1] if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 else: creds_sites[site_domain] = 1
sites_set = set(re.findall(regex_web, content)) message = 'Checked {} credentials found.'.format(len(creds)) if sites_set: message += ' Related websites: {}'.format( (', '.join(sites_set)) ) to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_rel_path) print('\n '.join(creds)) #num of creds above tresh, publish an alert if len(creds) > criticalNumberToAlert: print("========> Found more than 10 credentials in this file : {}".format( filepath )) publisher.warning(to_print) #Send to duplicate p.populate_set_out(filepath, 'Duplicate') msg = 'infoleak:automatic-detection="credential";{}'.format(filepath) p.populate_set_out(msg, 'Tags') #Put in form, count occurences, then send to moduleStats creds_sites = {} site_occurence = re.findall(regex_site_for_stats, content) for site in site_occurence: site_domain = site[1:-1] if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 else: creds_sites[site_domain] = 1 for url in sites:
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, type, url, domain, original_paste, super_father, *args, **kwargs): self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join( self.p.config.get("Directories", "crawled"), date) self.crawled_paste_filemame = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date) self.crawled_screenshot = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date) def start_requests(self): yield SplashRequest(self.start_urls, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_paste}, args={ 'html': 1, 'wait': 10, 'render_all': 1, 'har': 1, 'png': 1 }) def parse(self, response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if (error_log['info']['text'] == 'Connection to proxy refused' ): print('Connection to proxy refused') else: #avoid filename too big if len(self.domains[0]) > 215: UUID = self.domains[0][-215:] + str(uuid.uuid4()) else: UUID = self.domains[0] + str(uuid.uuid4()) filename_paste = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_screenshot = os.path.join(self.crawled_screenshot, UUID + '.png') # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd( '{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd( 'month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format( self.type, self.domains[0])): self.r_serv_onion.hset( '{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) self.r_serv_onion.hset( '{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) #create paste metadata self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'super_father', self.super_father) self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'father', response.meta['father']) self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'domain', self.domains[0]) self.r_serv_metadata.hset( 'paste_metadata:' + filename_paste, 'real_link', response.url) self.r_serv_metadata.sadd( 'paste_children:' + response.meta['father'], filename_paste) dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) size_screenshot = (len(response.data['png']) * 3) / 4 if size_screenshot < 5000000: #bytes with open(filename_screenshot, 'wb') as f: f.write( base64.standard_b64decode( response.data['png'].encode())) with open(filename_screenshot + 'har.txt', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): yield SplashRequest( link.url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': relative_filename_paste}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'har': 1, 'wait': 10 }) def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) if failure.check(ResponseNeverReceived): request = failure.request url = request.meta['splash']['args']['url'] father = request.meta['father'] self.logger.error( 'Splash, ResponseNeverReceived for %s, retry in 10s ...', url) time.sleep(10) yield SplashRequest(url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': father}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'har': 1, 'wait': 10 }) else: print('failure') #print(failure) print(failure.type) #print(failure.request.meta['item']) ''' #if isinstance(failure.value, HttpError): elif failure.check(HttpError): # you can get the response response = failure.value.response print('HttpError') self.logger.error('HttpError on %s', response.url) #elif isinstance(failure.value, DNSLookupError): elif failure.check(DNSLookupError): # this is the original request request = failure.request print(DNSLookupError) print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) ''' def save_crawled_paste(self, filename, content): if 
os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format( filename)) return False try: gzipencoded = gzip.compress(content.encode()) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: print("file error: {}".format(filename)) return False # send paste to Global relay_message = "{0} {1}".format(filename, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(filename) self.p.populate_set_out(msg, 'Tags') return True
r_serv2, PST.get_regex(email_regex)) if MX_values[0] >= 1: PST.__setattr__(channel, MX_values) PST.save_attribute_redis(channel, (MX_values[0], list(MX_values[1]))) pprint.pprint(MX_values) to_print = 'Mails;{};{};{};Checked {} e-mail(s)'.\ format(PST.p_source, PST.p_date, PST.p_name, MX_values[0]) if MX_values[0] > is_critical: publisher.warning(to_print) #Send to duplicate p.populate_set_out(filename, 'Duplicate') else: publisher.info(to_print) #Send to ModuleStats for mail in MX_values[1]: print 'mail;{};{};{}'.format(1, mail, PST.p_date) p.populate_set_out('mail;{};{};{}'.format(1, mail, PST.p_date), 'ModuleStats') p.populate_set_out('mail;{}'.format(filename), 'BrowseWarningPaste') prec_filename = filename else: publisher.debug("Script Mails is Idling 10s") print 'Sleeping' time.sleep(10)
all_cards = re.findall(regex, content) if len(all_cards) > 0: print 'All matching', all_cards creditcard_set = set([]) for card in all_cards: clean_card = re.sub('[^0-9]', '', card) if lib_refine.is_luhn_valid(clean_card): print clean_card, 'is valid' creditcard_set.add(clean_card) paste.__setattr__(channel, creditcard_set) paste.save_attribute_redis(channel, creditcard_set) pprint.pprint(creditcard_set) to_print = 'CreditCard;{};{};{};'.format( paste.p_source, paste.p_date, paste.p_name) if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid number(s);{}'.format( to_print, len(creditcard_set), paste.p_path)) #Send to duplicate p.populate_set_out(filename, 'Duplicate') #send to Browse_warning_paste p.populate_set_out('creditcard;{}'.format(filename), 'BrowseWarningPaste') else: publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path)) else: publisher.debug("Script creditcard is idling 1m") time.sleep(10)
creditcard_set = set([]) for card in all_cards: clean_card = re.sub('[^0-9]', '', card) clean_card = clean_card if lib_refine.is_luhn_valid(clean_card): print(clean_card, 'is valid') creditcard_set.add(clean_card) paste.__setattr__(channel, creditcard_set) paste.save_attribute_redis(channel, creditcard_set) pprint.pprint(creditcard_set) to_print = 'CreditCard;{};{};{};'.format( paste.p_source, paste.p_date, paste.p_name) if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid number(s);{}'.format( to_print, len(creditcard_set), paste.p_rel_path)) print('{}Checked {} valid number(s);{}'.format( to_print, len(creditcard_set), paste.p_rel_path)) #Send to duplicate p.populate_set_out(filename, 'Duplicate') msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename) p.populate_set_out(msg, 'Tags') else: publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_rel_path)) else: publisher.debug("Script creditcard is idling 1m") time.sleep(10)
delta = date_to - date_from # timedelta l_date = [] for i in range(delta.days + 1): date = date_from + datetime.timedelta(i) l_date.append(date.strftime('%Y%m%d')) return l_date config_section = 'Keys' p = Process(config_section) r_tags = redis.StrictRedis(host=p.config.get("ARDB_Tags", "host"), port=p.config.getint("ARDB_Tags", "port"), db=p.config.getint("ARDB_Tags", "db"), decode_responses=True) tag = 'infoleak:automatic-detection="pgp-message"' # get tag first/last seen first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen') last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen') l_dates = substract_date(first_seen, last_seen) # get all tagged items for date in l_dates: daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date)) for item in daily_tagged_items: p.populate_set_out(item, 'PgpDump')
delta = date_to - date_from # timedelta l_date = [] for i in range(delta.days + 1): date = date_from + datetime.timedelta(i) l_date.append( date.strftime('%Y%m%d') ) return l_date config_section = 'Keys' p = Process(config_section) r_tags = redis.StrictRedis( host=p.config.get("ARDB_Tags", "host"), port=p.config.getint("ARDB_Tags", "port"), db=p.config.getint("ARDB_Tags", "db"), decode_responses=True) tag = 'infoleak:automatic-detection="pgp-message"' # get tag first/last seen first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen') last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen') l_dates = substract_date(first_seen, last_seen) # get all tagged items for date in l_dates: daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date)) for item in daily_tagged_items: p.populate_set_out(item, 'PgpDump')
# Send a notification only when the member is in the set if dico_setname_to_redis[str(the_set)] in server_term.smembers(TrackedTermsNotificationEnabled_Name): # create mail body mail_body = ("AIL Framework,\n" "New occurrence for term: " + dico_setname_to_redis[str(the_set)] + "\n" ''+full_paste_url + filename) # Send to every associated email adress for email in server_term.smembers(TrackedTermsNotificationEmailsPrefix_Name + dico_setname_to_redis[str(the_set)]): sendEmailNotification(email, 'Term', mail_body) # tag paste for tag in server_term.smembers(TrackedTermsNotificationTagsPrefix_Name + dico_setname_to_redis[str(the_set)]): msg = '{};{}'.format(tag, filename) p.populate_set_out(msg, 'Tags') print(the_set, "matched in", filename) set_name = 'set_' + dico_setname_to_redis[the_set] new_to_the_set = server_term.sadd(set_name, filename) new_to_the_set = True if new_to_the_set == 1 else False #consider the num of occurence of this set set_value = int(server_term.hincrby(timestamp, dico_setname_to_redis[the_set], int(1))) # FIXME - avoid using per paste as a set is checked over the entire paste #1 term per paste if new_to_the_set: set_value_perPaste = int(server_term.hincrby("per_paste_" + str(timestamp), dico_setname_to_redis[the_set], int(1))) server_term.zincrby("per_paste_" + curr_set, dico_setname_to_redis[the_set], float(1)) server_term.zincrby(curr_set, dico_setname_to_redis[the_set], float(1))
sites= re.findall(regex_web, content) #Use to count occurences sites_set = set(re.findall(regex_web, content)) message = 'Checked {} credentials found.'.format(len(creds)) if sites_set: message += ' Related websites: {}'.format(', '.join(sites_set)) to_print = 'Credential;{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message) print('\n '.join(creds)) if len(creds) > critical: print("========> Found more than 10 credentials in this file : {}".format(filepath)) publisher.warning(to_print) #Send to duplicate p.populate_set_out(filepath, 'Duplicate') #Send to BrowseWarningPaste p.populate_set_out('credential;{}'.format(filepath), 'BrowseWarningPaste') #Put in form, count occurences, then send to moduleStats creds_sites = {} site_occurence = re.findall(regex_site_for_stats, content) for site in site_occurence: site_domain = site[1:-1] if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 else: creds_sites[site_domain] = 1 for url in sites: faup.decode(url)
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): self.type = type self.requested_mode = requested_mode self.original_item = original_item self.root_key = None self.start_urls = url self.domains = [domain] self.port = str(port) date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) self.full_date = date['date_day'] self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) self.arg_crawler = { 'html': crawler_options['html'], 'wait': 10, 'render_all': 1, 'har': crawler_options['har'], 'png': crawler_options['png']} config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date_str ) self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date_str ) self.crawled_har = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str ) self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot") ) def start_requests(self): yield SplashRequest( self.start_urls, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_item, 'root_key': None}, args=self.arg_crawler ) def parse(self,response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if(error_log['info']['text'] == 'Connection to proxy refused'): print('Connection to proxy refused') else: #avoid filename too big if len(self.domains[0]) > 215: UUID = self.domains[0][-215:]+str(uuid.uuid4()) else: UUID = self.domains[0]+str(uuid.uuid4()) filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_har = os.path.join(self.crawled_har, UUID) # # TODO: modify me # save new paste on disk if self.save_crawled_paste(relative_filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) # create root_key if self.root_key is None: self.root_key = relative_filename_paste # Create/Update crawler history self.r_serv_onion.zadd('crawler_history_{}:{}:{}'.format(self.type, self.domains[0], self.port), self.date_epoch, self.root_key) # Update domain port number all_domain_ports = self.r_serv_onion.hget('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports') if all_domain_ports: all_domain_ports = all_domain_ports.split(';') else: all_domain_ports = [] if self.port not in all_domain_ports: all_domain_ports.append(self.port) self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports)) #create paste metadata self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key) self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father']) self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port)) self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url) self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], relative_filename_paste) if 'png' in response.data: size_screenshot = (len(response.data['png'])*3) /4 if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto image_content = base64.standard_b64decode(response.data['png'].encode()) hash = sha256(image_content).hexdigest() img_dir_path = os.path.join(hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) filename_img = os.path.join(self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] +'.png') dirname = os.path.dirname(filename_img) if not os.path.exists(dirname): os.makedirs(dirname) if not os.path.exists(filename_img): with open(filename_img, 'wb') as f: f.write(image_content) # add item metadata self.r_serv_metadata.hset('paste_metadata:{}'.format(relative_filename_paste), 'screenshot', hash) # add sha256 metadata self.r_serv_onion.sadd('screenshot:{}'.format(hash), relative_filename_paste) if 'har' in response.data: dirname = os.path.dirname(filename_har) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename_har+'.json', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): yield SplashRequest( link.url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': relative_filename_paste, 'root_key': response.meta['root_key']}, args=self.arg_crawler ) def errback_catcher(self, 
failure): # catch all errback failures, self.logger.error(repr(failure)) if failure.check(ResponseNeverReceived): request = failure.request url = request.meta['splash']['args']['url'] father = request.meta['father'] self.logger.error('Splash, ResponseNeverReceived for %s, retry in 10s ...', url) time.sleep(10) if response: response_root_key = response.meta['root_key'] else: response_root_key = None yield SplashRequest( url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={'father': father, 'root_key': response.meta['root_key']}, args=self.arg_crawler ) else: print('failure') #print(failure) print(failure.type) #print(failure.request.meta['item']) ''' #if isinstance(failure.value, HttpError): elif failure.check(HttpError): # you can get the response response = failure.value.response print('HttpError') self.logger.error('HttpError on %s', response.url) #elif isinstance(failure.value, DNSLookupError): elif failure.check(DNSLookupError): # this is the original request request = failure.request print(DNSLookupError) print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) ''' def save_crawled_paste(self, filename, content): if os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format(filename)) return False try: gzipencoded = gzip.compress(content.encode()) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: print("file error: {}".format(filename)) return False # send paste to Global relay_message = "{0} {1}".format(filename, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(filename) self.p.populate_set_out(msg, 'Tags') return True
dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) decoded = base64.standard_b64decode(gzip64encoded) with open(filename, 'wb') as f: f.write(decoded) '''try: decoded2 = gunzip_bytes_obj(decoded) except: decoded2 ='' type = magic.from_buffer(decoded2, mime=True) if type!= 'text/x-c++' and type!= 'text/html' and type!= 'text/x-c' and type!= 'text/x-python' and type!= 'text/x-php' and type!= 'application/xml' and type!= 'text/x-shellscript' and type!= 'text/plain' and type!= 'text/x-diff' and type!= 'text/x-ruby': print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') print(filename) print(type) print('-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------') ''' # remove PASTES_FOLDER from item path (crawled item + submited) if PASTES_FOLDERS in paste: paste = paste.replace(PASTES_FOLDERS, '', 1) p.populate_set_out(paste) processed_paste+=1
MX_values = lib_refine.checking_MX_record( r_serv2, PST.get_regex(email_regex), addr_dns) if MX_values[0] >= 1: PST.__setattr__(channel, MX_values) PST.save_attribute_redis(channel, (MX_values[0], list(MX_values[1]))) to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\ format(PST.p_source, PST.p_date, PST.p_name, MX_values[0], PST.p_path) if MX_values[0] > is_critical: publisher.warning(to_print) #Send to duplicate p.populate_set_out(filename, 'Duplicate') p.populate_set_out('mail;{}'.format(filename), 'alertHandler') msg = 'infoleak:automatic-detection="mail";{}'.format(filename) p.populate_set_out(msg, 'Tags') #create country statistics date = datetime.datetime.now().strftime("%Y%m") for mail in MX_values[1]: print('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date)) p.populate_set_out('mail;{};{};{}'.format(MX_values[1][mail], mail, PST.p_date), 'ModuleStats') faup.decode(mail) tld = faup.get()['tld'] server_statistics.hincrby('mail_by_tld:'+date, tld, MX_values[1][mail])
r_serv2, PST.get_regex(email_regex)) if MX_values[0] >= 1: PST.__setattr__(channel, MX_values) PST.save_attribute_redis( channel, (MX_values[0], list(MX_values[1]))) pprint.pprint(MX_values) to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\ format(PST.p_source, PST.p_date, PST.p_name, MX_values[0], PST.p_path) if MX_values[0] > is_critical: publisher.warning(to_print) #Send to duplicate p.populate_set_out(filename, 'Duplicate') p.populate_set_out('mail;{}'.format(filename), 'alertHandler') else: publisher.info(to_print) #Send to ModuleStats for mail in MX_values[1]: print 'mail;{};{};{}'.format(1, mail, PST.p_date) p.populate_set_out( 'mail;{};{};{}'.format(1, mail, PST.p_date), 'ModuleStats') prec_filename = filename else:
'onion_metadata:{}'.format(domain), 'first_seen'): r_onion.sadd( 'onion_crawler_priority_queue', msg) print('send to priority queue') else: r_onion.sadd('onion_crawler_queue', msg) #p.populate_set_out(msg, 'Crawler') else: for url in fetch(p, r_cache, urls, domains_list, path): publisher.info('{}Checked {};{}'.format( to_print, url, PST.p_path)) p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') msg = 'infoleak:automatic-detection="onion";{}'.format( PST.p_path) p.populate_set_out(msg, 'Tags') else: publisher.info('{}Onion related;{}'.format( to_print, PST.p_path)) prec_filename = filename else: publisher.debug("Script url is Idling 10s") #print('Sleeping') time.sleep(10)
class AbstractModule(ABC): """ Abstract Module class """ def __init__(self, module_name=None, queue_name=None, logger_channel='Script'): """ Init Module module_name: str; set the module name if different from the instance ClassName queue_name: str; set the queue name if different from the instance ClassName logger_channel: str; set the logger channel name, 'Script' by default """ # Module name if provided else instance className self.module_name = module_name if module_name else self._module_name() # Module name if provided else instance className self.queue_name = queue_name if queue_name else self._module_name() # Init Redis Logger self.redis_logger = publisher # Port of the redis instance used by pubsublogger self.redis_logger.port = 6380 # Channel name to publish logs # # TODO: refactor logging # If provided could be a namespaced channel like script:<ModuleName> self.redis_logger.channel = logger_channel # Run module endlessly self.proceed = True # Waiting time in secondes between two proccessed messages self.pending_seconds = 10 # Setup the I/O queues self.process = Process(self.queue_name) def get_message(self): """ Get message from the Redis Queue (QueueIn) Input message can change between modules ex: '<item id>' """ return self.process.get_from_set() def send_message_to_queue(self, message, queue_name=None): """ Send message to queue :param message: message to send in queue :param queue_name: queue or module name ex: send_to_queue(item_id, 'Global') """ self.process.populate_set_out(message, queue_name) def run(self): """ Run Module endless process """ # Endless loop processing messages from the input queue while self.proceed: # Get one message (ex:item id) from the Redis Queue (QueueIn) message = self.get_message() if message: try: # Module processing with the message from the queue self.compute(message) except Exception as err: trace = traceback.format_tb(err.__traceback__) self.redis_logger.critical(f"Error in module {self.module_name}: {err}") self.redis_logger.critical(f"Module {self.module_name} input message: {message}") self.redis_logger.critical(trace) print() print(f"ERROR: {err}") print(f'MESSAGE: {message}') print('TRACEBACK:') for line in trace: print(line) else: self.computeNone() # Wait before next process self.redis_logger.debug(f"{self.module_name}, waiting for new message, Idling {self.pending_seconds}s") time.sleep(self.pending_seconds) def _module_name(self): """ Returns the instance class name (ie. the Module Name) """ return self.__class__.__name__ @abstractmethod def compute(self, message): """ Main method of the Module to implement """ pass def computeNone(self): """ Method of the Module when there is no message """ pass
type_hidden_service, domain), 'first_seen', date) r_onion.hset( '{}_metadata:{}'.format( type_hidden_service, domain), 'last_seen', date) else: #r_onion.hincrby('{}_link_up'.format(type_hidden_service), url, 1) if r_onion.sismember( 'month_{}_up:{}'.format( type_hidden_service, date_month), domain) and r_serv_metadata.exists( 'paste_children:' + paste): msg = 'infoleak:automatic-detection="{}";{}'.format( type_hidden_service, paste) p.populate_set_out(msg, 'Tags') # last check r_onion.hset( '{}_metadata:{}'.format(type_hidden_service, domain), 'last_check', date) # last_father r_onion.hset( '{}_metadata:{}'.format(type_hidden_service, domain), 'paste_parent', paste) # add onion screenshot history # add crawled days
to_print = 'Onion;{};{};{};'.format(PST.p_source, PST.p_date, PST.p_name) if len(domains_list) > 0: publisher.warning('{}Detected {} .onion(s);{}'.format( to_print, len(domains_list), PST.p_path)) now = datetime.datetime.now() path = os.path.join( 'onions', str(now.year).zfill(4), str(now.month).zfill(2), str(now.day).zfill(2), str(int(time.mktime(now.utctimetuple())))) to_print = 'Onion;{};{};{};'.format( PST.p_source, PST.p_date, PST.p_name) for url in fetch(p, r_cache, urls, domains_list, path): publisher.warning('{}Checked {};{}'.format( to_print, url, PST.p_path)) p.populate_set_out('onion;{}'.format(PST.p_path), 'BrowseWarningPaste') else: publisher.info('{}Onion related;{}'.format( to_print, PST.p_path)) prec_filename = filename else: publisher.debug("Script url is Idling 10s") print 'Sleeping' time.sleep(10) message = p.get_from_set()
message += ' Related websites: {}'.format((', '.join(sites_set))) to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path) print('\n '.join(creds)) #num of creds above tresh, publish an alert if len(creds) > criticalNumberToAlert: print("========> Found more than 10 credentials in this file : {}". format(filepath)) publisher.warning(to_print) #Send to duplicate p.populate_set_out(filepath, 'Duplicate') #Send to alertHandler msg = 'credential;{}'.format(filepath) p.populate_set_out(msg, 'alertHandler') #Put in form, count occurences, then send to moduleStats creds_sites = {} site_occurence = re.findall(regex_site_for_stats, content) for site in site_occurence: site_domain = site[1:-1] if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 else: creds_sites[site_domain] = 1 for url in sites:
delta = date_to - date_from # timedelta l_date = [] for i in range(delta.days + 1): date = date_from + datetime.timedelta(i) l_date.append( date.strftime('%Y%m%d') ) return l_date config_section = 'Global' p = Process(config_section) r_tags = redis.StrictRedis( host=p.config.get("ARDB_Tags", "host"), port=p.config.getint("ARDB_Tags", "port"), db=p.config.getint("ARDB_Tags", "db"), decode_responses=True) tag = 'infoleak:automatic-detection="bitcoin-address"' # get tag first/last seen first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen') last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen') l_dates = substract_date(first_seen, last_seen) # get all tagged items for date in l_dates: daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date)) for item in daily_tagged_items: p.populate_set_out(item)
bname = os.path.basename(filename) tmp_dict[bname] = [] with open(os.path.join(args.d, filename), 'r') as f: patterns = [r'%s' % (re.escape(s.strip())) for s in f] tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE) prec_filename = None while True: filename = p.get_from_set() if filename is None: publisher.debug("Script Categ is Idling 10s") print('Sleeping') time.sleep(10) continue paste = Paste.Paste(filename) content = paste.get_p_content() for categ, pattern in tmp_dict.items(): found = set(re.findall(pattern, content)) if len(found) >= matchingThreshold: msg = '{} {}'.format(paste.p_rel_path, len(found)) print(msg, categ) p.populate_set_out(msg, categ) publisher.info('Categ;{};{};{};Detected {} as {};{}'.format( paste.p_source, paste.p_date, paste.p_name, len(found), categ, paste.p_rel_path))
# FUNCTIONS # tmp_string = "Lines script Subscribed to channel {} and Start to publish \ on channel Longlines, Shortlines" publisher.info(tmp_string) while True: try: message = p.get_from_set() print message if message is not None: PST = Paste.Paste(message) else: publisher.debug("Tokeniser is idling 10s") time.sleep(10) continue # FIXME do it in the paste class lines_infos = PST.get_lines_info() PST.save_attribute_redis("p_nb_lines", lines_infos[0]) PST.save_attribute_redis("p_max_length_line", lines_infos[1]) # FIXME Not used. PST.store.sadd("Pastes_Objects", PST.p_path) if lines_infos[1] < args.max: p.populate_set_out(PST.p_path, 'LinesShort') else: p.populate_set_out(PST.p_path, 'LinesLong') except IOError: print "CRC Checksum Error on : ", PST.p_path
domain = re.findall(url_regex, url) if len(domain) > 0: domain = domain[0][4] else: continue if not r_onion.sismember('month_onion_up:{}'.format(date_month), domain) and not r_onion.sismember('onion_down:'+date , domain): if not r_onion.sismember('onion_domain_crawler_queue', domain): print('send to onion crawler') r_onion.sadd('onion_domain_crawler_queue', domain) msg = '{};{}'.format(url,PST.p_path) r_onion.sadd('onion_crawler_queue', msg) #p.populate_set_out(msg, 'Crawler') else: for url in fetch(p, r_cache, urls, domains_list, path): publisher.info('{}Checked {};{}'.format(to_print, url, PST.p_path)) p.populate_set_out('onion;{}'.format(PST.p_path), 'alertHandler') msg = 'infoleak:automatic-detection="onion";{}'.format(PST.p_path) p.populate_set_out(msg, 'Tags') else: publisher.info('{}Onion related;{}'.format(to_print, PST.p_path)) prec_filename = filename else: publisher.debug("Script url is Idling 10s") #print('Sleeping') time.sleep(10) message = p.get_from_set()
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, type, crawler_options, date, requested_mode, url, domain, port, original_item, *args, **kwargs): self.type = type self.requested_mode = requested_mode self.original_item = original_item self.root_key = None self.start_urls = url self.domains = [domain] self.port = str(port) date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) self.full_date = date['date_day'] self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) self.arg_crawler = { 'html': crawler_options['html'], 'wait': 10, 'render_all': 1, 'har': crawler_options['har'], 'png': crawler_options['png'] } config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join( self.p.config.get("Directories", "crawled"), date_str) self.crawled_paste_filemame = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date_str) self.crawled_har = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date_str) self.crawled_screenshot = os.path.join( os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot")) def start_requests(self): yield SplashRequest(self.start_urls, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={ 'father': self.original_item, 'root_key': None }, args=self.arg_crawler) def parse(self, response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if (error_log['info']['text'] == 'Connection to proxy refused' ): print('Connection to proxy refused') else: #avoid filename too big if len(self.domains[0]) > 215: UUID = self.domains[0][-215:] + str(uuid.uuid4()) else: UUID = self.domains[0] + str(uuid.uuid4()) filename_paste_full = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_har = os.path.join(self.crawled_har, UUID) # # TODO: modify me # save new paste on disk if self.save_crawled_paste(relative_filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd( '{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd( 'month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format( self.type, self.domains[0])): self.r_serv_onion.hset( '{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) # create root_key if self.root_key is None: self.root_key = relative_filename_paste # Create/Update crawler history self.r_serv_onion.zadd( 'crawler_history_{}:{}:{}'.format( self.type, self.domains[0], self.port), self.date_epoch, self.root_key) # Update domain port number all_domain_ports = self.r_serv_onion.hget( '{}_metadata:{}'.format(self.type, self.domains[0]), 'ports') if all_domain_ports: all_domain_ports = all_domain_ports.split(';') else: all_domain_ports = [] if self.port not in all_domain_ports: all_domain_ports.append(self.port) self.r_serv_onion.hset( '{}_metadata:{}'.format( self.type, self.domains[0]), 'ports', ';'.join(all_domain_ports)) #create paste metadata self.r_serv_metadata.hset( 'paste_metadata:{}'.format(relative_filename_paste), 'super_father', self.root_key) self.r_serv_metadata.hset( 'paste_metadata:{}'.format(relative_filename_paste), 'father', response.meta['father']) self.r_serv_metadata.hset( 'paste_metadata:{}'.format(relative_filename_paste), 'domain', '{}:{}'.format(self.domains[0], self.port)) self.r_serv_metadata.hset( 'paste_metadata:{}'.format(relative_filename_paste), 'real_link', response.url) self.r_serv_metadata.sadd( 'paste_children:' + response.meta['father'], relative_filename_paste) if 'png' in response.data: size_screenshot = (len(response.data['png']) * 3) / 4 if size_screenshot < 5000000 or self.requested_mode: #bytes or manual/auto image_content = base64.standard_b64decode( response.data['png'].encode()) hash = sha256(image_content).hexdigest() img_dir_path = os.path.join( hash[0:2], hash[2:4], hash[4:6], hash[6:8], hash[8:10], hash[10:12]) filename_img = os.path.join( self.crawled_screenshot, 'screenshot', img_dir_path, hash[12:] + '.png') dirname = os.path.dirname(filename_img) if not os.path.exists(dirname): os.makedirs(dirname) if not os.path.exists(filename_img): with open(filename_img, 'wb') as f: f.write(image_content) # add item metadata self.r_serv_metadata.hset( 'paste_metadata:{}'.format( relative_filename_paste), 'screenshot', hash) # add sha256 metadata self.r_serv_onion.sadd( 'screenshot:{}'.format(hash), relative_filename_paste) if 'har' in response.data: dirname = os.path.dirname(filename_har) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename_har + '.json', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): yield SplashRequest(link.url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={ 'father': relative_filename_paste, 'root_key': response.meta['root_key'] }, 
args=self.arg_crawler) def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) if failure.check(ResponseNeverReceived): request = failure.request url = request.meta['splash']['args']['url'] father = request.meta['father'] self.logger.error( 'Splash, ResponseNeverReceived for %s, retry in 10s ...', url) time.sleep(10) if response: response_root_key = response.meta['root_key'] else: response_root_key = None yield SplashRequest(url, self.parse, errback=self.errback_catcher, endpoint='render.json', meta={ 'father': father, 'root_key': response.meta['root_key'] }, args=self.arg_crawler) else: print('failure') #print(failure) print(failure.type) #print(failure.request.meta['item']) ''' #if isinstance(failure.value, HttpError): elif failure.check(HttpError): # you can get the response response = failure.value.response print('HttpError') self.logger.error('HttpError on %s', response.url) #elif isinstance(failure.value, DNSLookupError): elif failure.check(DNSLookupError): # this is the original request request = failure.request print(DNSLookupError) print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) ''' def save_crawled_paste(self, filename, content): if os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format( filename)) return False try: gzipencoded = gzip.compress(content.encode()) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: print("file error: {}".format(filename)) return False # send paste to Global relay_message = "{0} {1}".format(filename, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(filename) self.p.populate_set_out(msg, 'Tags') return True
url_regex = "("+uri_scheme+")\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*" while True: if message is not None: filename, score = message.split() if prec_filename is None or filename != prec_filename: domains_list = [] PST = Paste.Paste(filename) client = ip2asn() for x in PST.get_regex(url_regex): matching_url = re.search(url_regex, PST.get_p_content()) url = matching_url.group(0) to_send = "{} {} {}".format(url, PST._get_p_date(), filename) p.populate_set_out(to_send, 'Url') faup.decode(url) domain = faup.get_domain() subdomain = faup.get_subdomain() f1 = None domains_list.append(domain) publisher.debug('{} Published'.format(url)) if f1 == "onion": print domain hostl = unicode(avoidNone(subdomain)+avoidNone(domain)) try:
all_cards = re.findall(regex, content) if len(all_cards) > 0: print 'All matching', all_cards creditcard_set = set([]) for card in all_cards: clean_card = re.sub('[^0-9]', '', card) if lib_refine.is_luhn_valid(clean_card): print clean_card, 'is valid' creditcard_set.add(clean_card) paste.__setattr__(channel, creditcard_set) paste.save_attribute_redis(channel, creditcard_set) pprint.pprint(creditcard_set) to_print = 'CreditCard;{};{};{};'.format( paste.p_source, paste.p_date, paste.p_name) if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid number(s)'.format( to_print, len(creditcard_set))) #Send to duplicate p.populate_set_out(filename, 'Duplicate') #send to Browse_warning_paste p.populate_set_out('creditcard;{}'.format(filename), 'BrowseWarningPaste') else: publisher.info('{}CreditCard related'.format(to_print)) else: publisher.debug("Script creditcard is idling 1m") time.sleep(10)
# FUNCTIONS # tmp_string = "Lines script Subscribed to channel {} and Start to publish \ on channel Longlines, Shortlines" publisher.info(tmp_string) while True: try: message = p.get_from_set() print(message) if message is not None: PST = Paste.Paste(message) else: publisher.debug("Tokeniser is idling 10s") time.sleep(10) continue # FIXME do it in the paste class lines_infos = PST.get_lines_info() PST.save_attribute_redis("p_nb_lines", lines_infos[0]) PST.save_attribute_redis("p_max_length_line", lines_infos[1]) # FIXME Not used. PST.store.sadd("Pastes_Objects", PST.p_rel_path) print(PST.p_rel_path) if lines_infos[1] < args.max: p.populate_set_out( PST.p_rel_path , 'LinesShort') else: p.populate_set_out( PST.p_rel_path , 'LinesLong') except IOError: print("CRC Checksum Error on : ", PST.p_rel_path)
# Sent to the logging a description of the module publisher.info("Tags module started") # Endless loop getting messages from the input queue while True: # Get one message from the input queue message = p.get_from_set() if message is None: publisher.debug("{} queue is empty, waiting 10s".format(config_section)) time.sleep(10) continue else: tag, path = message.split(';') # add the tag to the tags word_list res = server.sadd('list_tags', tag) if res == 1: print("new tags added : {}".format(tag)) # add the path to the tag set res = server.sadd(tag, path) if res == 1: print("new paste: {}".format(path)) print(" tagged: {}".format(tag)) server_metadata.sadd('tag:'+path, tag) curr_date = datetime.date.today() serv_statistics.hincrby(curr_date.strftime("%Y%m%d"),'paste_tagged:'+tag, 1) p.populate_set_out(message, 'MISP_The_Hive_feeder')
new_file_md5) else: filename = '{}_{}'.format(filename, new_file_md5) # continue if new file already exist if os.path.isfile(filename): print('ignore duplicated file') continue print('new file: {}'.format(filename)) # ignore duplicate else: print('ignore duplicated file') continue # create subdir dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename, 'wb') as f: f.write(decoded) paste = filename # remove PASTES_FOLDER from if PASTES_FOLDERS in paste: paste = paste.replace(PASTES_FOLDERS, '', 1) p.populate_set_out(paste) processed_paste += 1
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, type, url, domain,original_paste, super_father, *args, **kwargs): self.type = type self.original_paste = original_paste self.super_father = super_father self.start_urls = url self.domains = [domain] date = datetime.datetime.now().strftime("%Y/%m/%d") self.full_date = datetime.datetime.now().strftime("%Y%m%d") self.date_month = datetime.datetime.now().strftime("%Y%m") config_section = 'Crawler' self.p = Process(config_section) self.r_cache = redis.StrictRedis( host=self.p.config.get("Redis_Cache", "host"), port=self.p.config.getint("Redis_Cache", "port"), db=self.p.config.getint("Redis_Cache", "db"), decode_responses=True) self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.r_serv_metadata = redis.StrictRedis( host=self.p.config.get("ARDB_Metadata", "host"), port=self.p.config.getint("ARDB_Metadata", "port"), db=self.p.config.getint("ARDB_Metadata", "db"), decode_responses=True) self.r_serv_onion = redis.StrictRedis( host=self.p.config.get("ARDB_Onion", "host"), port=self.p.config.getint("ARDB_Onion", "port"), db=self.p.config.getint("ARDB_Onion", "db"), decode_responses=True) self.crawler_path = os.path.join(self.p.config.get("Directories", "crawled"), date ) self.crawled_paste_filemame = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "pastes"), self.p.config.get("Directories", "crawled"), date ) self.crawled_screenshot = os.path.join(os.environ['AIL_HOME'], self.p.config.get("Directories", "crawled_screenshot"), date ) def start_requests(self): yield SplashRequest( self.start_urls, self.parse, #errback=self.errback_catcher, endpoint='render.json', meta={'father': self.original_paste}, args={ 'html': 1, 'wait': 10, 'render_all': 1, 'har': 1, 'png': 1} ) def parse(self,response): #print(response.headers) #print(response.status) if response.status == 504: # down ? print('504 detected') elif response.status != 200: print('other response: {}'.format(response.status)) #print(error_log) #detect connection to proxy refused error_log = (json.loads(response.body.decode())) if(error_log['info']['text'] == 'Connection to proxy refused'): print('Connection to proxy refused') else: UUID = self.domains[0]+str(uuid.uuid4()) filename_paste = os.path.join(self.crawled_paste_filemame, UUID) relative_filename_paste = os.path.join(self.crawler_path, UUID) filename_screenshot = os.path.join(self.crawled_screenshot, UUID +'.png') # save new paste on disk if self.save_crawled_paste(filename_paste, response.data['html']): # add this paste to the domain crawled set # TODO: # FIXME: put this on cache ? 
#self.r_serv_onion.sadd('temp:crawled_domain_pastes:{}'.format(self.domains[0]), filename_paste) self.r_serv_onion.sadd('{}_up:{}'.format(self.type, self.full_date), self.domains[0]) self.r_serv_onion.sadd('full_{}_up'.format(self.type), self.domains[0]) self.r_serv_onion.sadd('month_{}_up:{}'.format(self.type, self.date_month), self.domains[0]) # create onion metadata if not self.r_serv_onion.exists('{}_metadata:{}'.format(self.type, self.domains[0])): self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'first_seen', self.full_date) self.r_serv_onion.hset('{}_metadata:{}'.format(self.type, self.domains[0]), 'last_seen', self.full_date) #create paste metadata self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'super_father', self.super_father) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'father', response.meta['father']) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'domain', self.domains[0]) self.r_serv_metadata.hset('paste_metadata:'+filename_paste, 'real_link', response.url) self.r_serv_metadata.sadd('paste_children:'+response.meta['father'], filename_paste) dirname = os.path.dirname(filename_screenshot) if not os.path.exists(dirname): os.makedirs(dirname) size_screenshot = (len(response.data['png'])*3) /4 if size_screenshot < 5000000: #bytes with open(filename_screenshot, 'wb') as f: f.write(base64.standard_b64decode(response.data['png'].encode())) with open(filename_screenshot+'har.txt', 'wb') as f: f.write(json.dumps(response.data['har']).encode()) # save external links in set #lext = LinkExtractor(deny_domains=self.domains, unique=True) #for link in lext.extract_links(response): # self.r_serv_onion.sadd('domain_{}_external_links:{}'.format(self.type, self.domains[0]), link.url) # self.r_serv_metadata.sadd('paste_{}_external_links:{}'.format(self.type, filename_paste), link.url) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): yield SplashRequest( link.url, self.parse, #errback=self.errback_catcher, endpoint='render.json', meta={'father': relative_filename_paste}, args={ 'html': 1, 'png': 1, 'render_all': 1, 'har': 1, 'wait': 10} ) ''' def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) print('failure') #print(failure) print(failure.type) #print(failure.request.meta['item']) #if isinstance(failure.value, HttpError): if failure.check(HttpError): # you can get the response response = failure.value.response print('HttpError') self.logger.error('HttpError on %s', response.url) #elif isinstance(failure.value, DNSLookupError): elif failure.check(DNSLookupError): # this is the original request request = failure.request print(DNSLookupError) print('DNSLookupError') self.logger.error('DNSLookupError on %s', request.url) #elif isinstance(failure.value, TimeoutError): elif failure.check(TimeoutError): request = failure.request print('TimeoutError') print(TimeoutError) self.logger.error('TimeoutError on %s', request.url) ''' def save_crawled_paste(self, filename, content): if os.path.isfile(filename): print('File: {} already exist in submitted pastes'.format(filename)) return False try: gzipencoded = gzip.compress(content.encode()) gzip64encoded = base64.standard_b64encode(gzipencoded).decode() except: print("file error: {}".format(filename)) return False # send paste to Global relay_message = "{0} {1}".format(filename, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name 
self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(filename) self.p.populate_set_out(msg, 'Tags') return True
delta = date_to - date_from # timedelta l_date = [] for i in range(delta.days + 1): date = date_from + datetime.timedelta(i) l_date.append(date.strftime('%Y%m%d')) return l_date config_section = 'Global' p = Process(config_section) r_tags = redis.StrictRedis(host=p.config.get("ARDB_Tags", "host"), port=p.config.getint("ARDB_Tags", "port"), db=p.config.getint("ARDB_Tags", "db"), decode_responses=True) tag = 'infoleak:automatic-detection="bitcoin-address"' # get tag first/last seen first_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'first_seen') last_seen = r_tags.hget('tag_metadata:{}'.format(tag), 'last_seen') l_dates = substract_date(first_seen, last_seen) # get all tagged items for date in l_dates: daily_tagged_items = r_tags.smembers('{}:{}'.format(tag, date)) for item in daily_tagged_items: p.populate_set_out(item)
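A minimal usage sketch of the date-range helper above, assuming the enclosing function is the substract_date(date_from, date_to) helper called below and that it first converts its 'YYYYMMDD' string arguments to datetime.date objects (that conversion is not shown in the snippet):

import datetime

def substract_date(date_from, date_to):
    # Convert the 'YYYYMMDD' strings coming from the tag metadata into dates.
    date_from = datetime.date(int(date_from[0:4]), int(date_from[4:6]), int(date_from[6:8]))
    date_to = datetime.date(int(date_to[0:4]), int(date_to[4:6]), int(date_to[6:8]))
    delta = date_to - date_from  # timedelta
    l_date = []
    for i in range(delta.days + 1):
        date = date_from + datetime.timedelta(i)
        l_date.append(date.strftime('%Y%m%d'))
    return l_date

print(substract_date('20181230', '20190102'))
# ['20181230', '20181231', '20190101', '20190102']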
url_regex = "(http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*" while True: if message is not None: filename, score = message.split() if prec_filename is None or filename != prec_filename: domains_list = [] PST = Paste.Paste(filename) client = ip2asn() for x in PST.get_regex(url_regex): scheme, credential, subdomain, domain, host, tld, \ port, resource_path, query_string, f1, f2, f3, \ f4 = x domains_list.append(domain) p.populate_set_out(x, 'Url') publisher.debug('{} Published'.format(x)) if f1 == "onion": print domain hostl = unicode(subdomain + domain) try: socket.setdefaulttimeout(2) ip = socket.gethostbyname(unicode(hostl)) except: # If the resolver is not giving any IPv4 address, # ASN/CC lookup is skip. continue try:
relay_message = "{0} {1}".format(paste_name, gzip64encoded) #relay_message = b" ".join( [paste_name, gzip64encoded] ) digest = hashlib.sha1(gzip64encoded.encode('utf8')).hexdigest() # Avoid any duplicate coming from any sources if operation_mode == 1: if server.exists(digest): # Content already exists #STATS duplicated_paste_per_feeder[feeder_name] += 1 else: # New content # populate Global OR populate another set based on the feeder_name if feeder_name in FEED_QUEUE_MAPPING: p.populate_set_out(relay_message, FEED_QUEUE_MAPPING[feeder_name]) else: p.populate_set_out(relay_message, 'Mixer') server.sadd(digest, feeder_name) server.expire(digest, ttl_key) # Keep duplicate coming from different sources elif operation_mode == 2: # Filter to avoid duplicate content = server.get('HASH_' + paste_name) if content is None: # New content # Store in redis for filtering server.set('HASH_' + paste_name, digest) server.sadd(paste_name, feeder_name)
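A minimal, self-contained sketch of the deduplication idea used above in operation mode 1: the gzip+base64 payload is hashed with SHA1 and the digest is kept in Redis with a TTL, so identical content seen again within that window is counted as a duplicate instead of being forwarded. The names server, ttl_key and feeder_name follow the snippet; the redis.Redis() connection parameters and TTL value are placeholders.

import hashlib
import redis

server = redis.Redis(host='localhost', port=6379, db=1, decode_responses=True)
ttl_key = 86400  # placeholder: keep digests for one day

def is_duplicate(gzip64encoded, feeder_name):
    # Identical pastes produce identical digests of the encoded payload.
    digest = hashlib.sha1(gzip64encoded.encode('utf8')).hexdigest()
    if server.exists(digest):
        return True
    # First time this content is seen: remember it for ttl_key seconds.
    server.sadd(digest, feeder_name)
    server.expire(digest, ttl_key)
    return False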
r_serv2, PST.get_regex(email_regex)) if MX_values[0] >= 1: PST.__setattr__(channel, MX_values) PST.save_attribute_redis( channel, (MX_values[0], list(MX_values[1]))) pprint.pprint(MX_values) to_print = 'Mails;{};{};{};Checked {} e-mail(s);{}'.\ format(PST.p_source, PST.p_date, PST.p_name, MX_values[0], PST.p_path) if MX_values[0] > is_critical: publisher.warning(to_print) #Send to duplicate p.populate_set_out(filename, 'Duplicate') p.populate_set_out('mail;{}'.format(filename), 'BrowseWarningPaste') else: publisher.info(to_print) #Send to ModuleStats for mail in MX_values[1]: print 'mail;{};{};{}'.format(1, mail, PST.p_date) p.populate_set_out( 'mail;{};{};{}'.format(1, mail, PST.p_date), 'ModuleStats') prec_filename = filename else:
int(1))) # Add in set only if term is not in the blacklist if low_word not in server_term.smembers(BlackListTermsSet_Name): # count the number of occurrences of this term server_term.zincrby(curr_set, low_word, float(score)) #1 term per paste server_term.zincrby("per_paste_" + curr_set, low_word, float(1)) #Add more info for tracked terms check_if_tracked_term(low_word, filename) #send to RegexForTermsFrequency to_send = "{} {} {}".format(filename, timestamp, word) p.populate_set_out(to_send, 'RegexForTermsFrequency') else: if generate_new_graph: generate_new_graph = False print('Building graph') today = datetime.date.today() year = today.year month = today.month lib_words.create_curve_with_word_file(r_serv1, csv_path, wordfile_path, year, month) publisher.debug("Script Curve is Idling")
def main(): publisher.port = 6380 publisher.channel = "Script" config_section = 'DomClassifier' p = Process(config_section) addr_dns = p.config.get("DomClassifier", "dns") publisher.info("""ZMQ DomainClassifier is Running""") c = DomainClassifier.domainclassifier.Extract(rawtext="", nameservers=[addr_dns]) cc = p.config.get("DomClassifier", "cc") cc_tld = p.config.get("DomClassifier", "cc_tld") while True: try: item_id = p.get_from_set() if item_id is None: publisher.debug("Script DomClassifier is idling 1s") time.sleep(1) continue item_content = item_basic.get_item_content(item_id) mimetype = item_basic.get_item_mimetype(item_id) item_basename = item_basic.get_basename(item_id) item_source = item_basic.get_source(item_id) item_date = item_basic.get_item_date(item_id) if mimetype.split('/')[0] == "text": c.text(rawtext=item_content) c.potentialdomain() c.validdomain(passive_dns=True, extended=False) print(c.vdomain) if c.vdomain and d4.is_passive_dns_enabled(): for dns_record in c.vdomain: p.populate_set_out(dns_record) localizeddomains = c.include(expression=cc_tld) if localizeddomains: print(localizeddomains) publisher.warning( f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {cc_tld};{item_id}" ) localizeddomains = c.localizedomain(cc=cc) if localizeddomains: print(localizeddomains) publisher.warning( f"DomainC;{item_source};{item_date};{item_basename};Checked {localizeddomains} located in {cc};{item_id}" ) except IOError: print("CRC Checksum Failed on :", item_id) publisher.error( f"Duplicate;{item_source};{item_date};{item_basename};CRC Checksum Failed" )
# Script is the default channel used for the modules. publisher.channel = 'Script' # Section name in bin/packages/modules.cfg config_section = 'Tags' # Setup the I/O queues p = Process(config_section) # Sent to the logging a description of the module publisher.info("Tags module started") # Endless loop getting messages from the input queue while True: # Get one message from the input queue message = p.get_from_set() if message is None: publisher.debug( "{} queue is empty, waiting 10s".format(config_section)) time.sleep(10) continue else: print(message) tag, item_id = message.split(';') Tag.add_tag("item", tag, item_id) p.populate_set_out(message, 'MISP_The_Hive_feeder')
if __name__ == '__main__': # If you wish to use another port or channel, do not forget to run a subscriber accordingly (see launch_logs.sh) # Port of the redis instance used by pubsublogger publisher.port = 6380 # Script is the default channel used for the modules. publisher.channel = 'Script' # Section name in bin/packages/modules.cfg config_section = '<section name>' # Setup the I/O queues p = Process(config_section) # Send a description of the module to the logging system publisher.info("<description of the module>") # Endless loop getting messages from the input queue while True: # Get one message from the input queue message = p.get_from_set() if message is None: publisher.debug("{} queue is empty, waiting".format(config_section)) time.sleep(1) continue # Do something with the message from the queue something_has_been_done = do_something(message) # (Optional) Send that thing to the next queue p.populate_set_out(something_has_been_done)
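The template above calls an undefined do_something(); a hypothetical placeholder implementation, shown only to make the skeleton runnable (the real processing depends entirely on the module being written):

def do_something(message):
    # Hypothetical processing step: the template only requires that some
    # value derived from the incoming message is returned for the next queue.
    processed = message.strip().lower()
    return processed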
bname = os.path.basename(filename) tmp_dict[bname] = [] with open(os.path.join(args.d, filename), 'r') as f: patterns = [r'%s' % re.escape(s.strip()) for s in f] tmp_dict[bname] = re.compile('|'.join(patterns), re.IGNORECASE) prec_filename = None while True: filename = p.get_from_set() if filename is None: publisher.debug("Script Categ is Idling 10s") print 'Sleeping' time.sleep(10) continue paste = Paste.Paste(filename) content = paste.get_p_content() for categ, pattern in tmp_dict.items(): found = set(re.findall(pattern, content)) if len(found) > 0: msg = '{} {}'.format(paste.p_path, len(found)) print msg, categ p.populate_set_out(msg, categ) publisher.info( 'Categ;{};{};{};Detected {} as {}'.format( paste.p_source, paste.p_date, paste.p_name, len(found), categ))
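A small illustration of how the category patterns above are built: each line of a category file is escaped and the lines are OR-ed into one case-insensitive regex, so a single re.findall() per category is enough. The file content and test string below are made up for the example.

import re

# Hypothetical content of one category file
lines = ['visa', 'mastercard', 'american express']

patterns = [r'%s' % re.escape(s.strip()) for s in lines]
pattern = re.compile('|'.join(patterns), re.IGNORECASE)

content = 'Paid with my VISA and an American Express card.'
found = set(re.findall(pattern, content))
print(found)   # {'VISA', 'American Express'}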
sites_set = set(re.findall(regex_web, content)) message = 'Checked {} credentials found.'.format(len(creds)) if sites_set: message += ' Related websites: {}'.format(', '.join(sites_set)) to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path) print('\n '.join(creds)) # if the number of creds is above the threshold, publish an alert if len(creds) > criticalNumberToAlert: print("========> Found more than 10 credentials in this file : {}".format(filepath)) publisher.warning(to_print) #Send to duplicate p.populate_set_out(filepath, 'Duplicate') #Send to alertHandler p.populate_set_out('credential;{}'.format(filepath), 'alertHandler') # Put in form, count occurrences, then send to moduleStats creds_sites = {} site_occurence = re.findall(regex_site_for_stats, content) for site in site_occurence: site_domain = site[1:-1] if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 else: creds_sites[site_domain] = 1 for url in sites: faup.decode(url)
if sites_set: message += ' Related websites: {}'.format(', '.join(sites_set)) to_print = 'Credential;{};{};{};{};{}'.format(paste.p_source, paste.p_date, paste.p_name, message, paste.p_path) print('\n '.join(creds)) if len(creds) > critical: print("========> Found more than 10 credentials in this file : {}".format(filepath)) publisher.warning(to_print) #Send to duplicate p.populate_set_out(filepath, 'Duplicate') #Send to BrowseWarningPaste p.populate_set_out('credential;{}'.format(filepath), 'BrowseWarningPaste') # Put in form, count occurrences, then send to moduleStats creds_sites = {} site_occurence = re.findall(regex_site_for_stats, content) for site in site_occurence: site_domain = site[1:-1] if site_domain in creds_sites.keys(): creds_sites[site_domain] += 1 else: creds_sites[site_domain] = 1 for url in sites:
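The site-occurrence counting in the credential snippets above can be written more compactly with collections.Counter; a sketch of the equivalent logic, with placeholder values for regex_site_for_stats and content (in the module they come from the paste being analysed):

from collections import Counter
import re

# Placeholder regex and content, for illustration only
regex_site_for_stats = r'@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,6}[:;]'
content = 'user@example.com: pass1 user@example.com: pass2 admin@test.org; pw'

site_occurrence = re.findall(regex_site_for_stats, content)
# site[1:-1] strips the leading '@' and the trailing ':' or ';',
# exactly like the manual dict counting above.
creds_sites = Counter(site[1:-1] for site in site_occurrence)
print(creds_sites)   # Counter({'example.com': 2, 'test.org': 1})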
# Port of the redis instance used by pubsublogger publisher.port = 6380 # Script is the default channel used for the modules. publisher.channel = 'Script' # Section name in bin/packages/modules.cfg config_section = 'PreProcessFeed' # Setup the I/O queues p = Process(config_section) # Sent to the logging a description of the module publisher.info("<description of the module>") # Endless loop getting messages from the input queue while True: # Get one message from the input queue message = p.get_from_set() if message is None: publisher.debug( "{} queue is empty, waiting".format(config_section)) print("queue empty") time.sleep(1) continue # Do something with the message from the queue new_message = do_something(message) # (Optional) Send that thing to the next queue p.populate_set_out(new_message)
# LOGGING # publisher.info("Feed Script started to receive & publish.") while True: message = p.get_from_set() # Recovering the streamed message information. if message is not None: splitted = message.split() if len(splitted) == 2: paste, gzip64encoded = splitted else: # TODO Store the name of the empty paste inside a Redis-list. print "Empty Paste: not processed" publisher.debug("Empty Paste: {0} not processed".format(message)) continue else: print "Empty Queues: Waiting..." time.sleep(1) continue # Creating the full filepath filename = os.path.join(os.environ["AIL_HOME"], p.config.get("Directories", "pastes"), paste) dirname = os.path.dirname(filename) if not os.path.exists(dirname): os.makedirs(dirname) with open(filename, "wb") as f: f.write(base64.standard_b64decode(gzip64encoded)) p.populate_set_out(filename)
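For context, a minimal sketch of the producer side this feed script expects: the raw paste is gzip-compressed, base64-encoded, and sent as a single '<paste_name> <gzip64encoded>' text message, mirroring the base64.standard_b64decode() done above. The function name and the example paste name are illustrative, not part of the snippet.

import base64
import gzip

def build_feed_message(paste_name, raw_content):
    # Compress the raw paste, then base64-encode it so it can travel as a
    # single space-separated text message on the queue.
    gzipencoded = gzip.compress(raw_content.encode())
    gzip64encoded = base64.standard_b64encode(gzipencoded).decode()
    return '{} {}'.format(paste_name, gzip64encoded)

msg = build_feed_message('archive/example_feed/2019/01/01/example.gz', 'some paste content')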
class TorSplashSpider(Spider): name = 'TorSplashSpider' def __init__(self, splash_url, type, crawler_options, date, requested_mode, url, domain, port, cookies, original_item, *args, **kwargs): self.splash_url = splash_url self.domain_type = type self.requested_mode = requested_mode self.original_item = original_item self.root_key = None self.start_urls = url self.domains = [domain] self.port = str(port) date_str = '{}/{}/{}'.format(date['date_day'][0:4], date['date_day'][4:6], date['date_day'][6:8]) self.full_date = date['date_day'] self.date_month = date['date_month'] self.date_epoch = int(date['epoch']) self.png = crawler_options['png'] self.har = crawler_options['har'] self.cookies = cookies config_section = 'Crawler' self.p = Process(config_section) self.item_dir = os.path.join( self.p.config.get("Directories", "crawled"), date_str) config_loader = ConfigLoader.ConfigLoader() self.har_dir = os.path.join( config_loader.get_files_directory('har'), date_str) config_loader = None self.r_serv_log_submit = redis.StrictRedis( host=self.p.config.get("Redis_Log_submit", "host"), port=self.p.config.getint("Redis_Log_submit", "port"), db=self.p.config.getint("Redis_Log_submit", "db"), decode_responses=True) self.root_key = None def build_request_arg(self, cookies): return { 'wait': 10, 'resource_timeout': 30, # /!\ Weird behaviour if timeout < resource_timeout /!\ 'timeout': 30, 'cookies': cookies, 'lua_source': script_cookie } def start_requests(self): l_cookies = self.build_request_arg(self.cookies) yield SplashRequest(self.start_urls, self.parse, errback=self.errback_catcher, endpoint='execute', meta={ 'father': self.original_item, 'current_url': self.start_urls }, args=l_cookies) # # TODO: remove duplicate and anchor def parse(self, response): #print(response.headers) #print(response.status) if response.status == 504: # no response #print('504 detected') pass # LUA ERROR # # TODO: print/display errors elif 'error' in response.data: if (response.data['error'] == 'network99'): ## splash restart ## error_retry = request.meta.get('error_retry', 0) if error_retry < 3: error_retry += 1 url = request.meta['current_url'] father = request.meta['father'] self.logger.error( 'Splash, ResponseNeverReceived for %s, retry in 10s ...', url) time.sleep(10) yield SplashRequest(url, self.parse, errback=self.errback_catcher, endpoint='execute', cache_args=['lua_source'], meta={ 'father': father, 'current_url': url, 'error_retry': error_retry }, args=self.build_request_arg( response.cookiejar)) else: print('Connection to proxy refused') else: print(response.data['error']) elif response.status != 200: print('other response: {}'.format(response.status)) # detect connection to proxy refused error_log = (json.loads(response.body.decode())) print(error_log) #elif crawlers.is_redirection(self.domains[0], response.data['last_url']): # pass # ignore response else: item_id = crawlers.create_item_id(self.item_dir, self.domains[0]) self.save_crawled_item(item_id, response.data['html']) crawlers.create_item_metadata(item_id, self.domains[0], response.data['last_url'], self.port, response.meta['father']) if self.root_key is None: self.root_key = item_id crawlers.add_domain_root_item(item_id, self.domain_type, self.domains[0], self.date_epoch, self.port) crawlers.create_domain_metadata(self.domain_type, self.domains[0], self.port, self.full_date, self.date_month) if 'cookies' in response.data: all_cookies = response.data['cookies'] else: all_cookies = [] # SCREENSHOT if 'png' in response.data and self.png: sha256_string = 
Screenshot.save_crawled_screeshot( response.data['png'], 5000000, f_save=self.requested_mode) if sha256_string: Screenshot.save_item_relationship( sha256_string, item_id) Screenshot.save_domain_relationship( sha256_string, self.domains[0]) # HAR if 'har' in response.data and self.har: crawlers.save_har(self.har_dir, item_id, response.data['har']) le = LinkExtractor(allow_domains=self.domains, unique=True) for link in le.extract_links(response): l_cookies = self.build_request_arg(all_cookies) yield SplashRequest(link.url, self.parse, errback=self.errback_catcher, endpoint='execute', meta={ 'father': item_id, 'current_url': link.url }, args=l_cookies) def errback_catcher(self, failure): # catch all errback failures, self.logger.error(repr(failure)) if failure.check(ResponseNeverReceived): ## DEBUG ## self.logger.error(failure.request) if failure.value.response: self.logger.error(failure.value.response) ## ----- ## # Extract request metadata url = failure.request.meta['current_url'] father = failure.request.meta['father'] l_cookies = self.build_request_arg( failure.request.meta['splash']['args']['cookies']) # Check if Splash restarted if not crawlers.is_splash_reachable(self.splash_url): self.logger.error( 'Splash, ResponseNeverReceived for %s, retry in 30s ...', url) time.sleep(30) yield SplashRequest(url, self.parse, errback=self.errback_catcher, endpoint='execute', meta={ 'father': father, 'current_url': url }, args=l_cookies) else: self.logger.error(failure.type) self.logger.error(failure.getErrorMessage()) def save_crawled_item(self, item_id, item_content): gzip64encoded = crawlers.save_crawled_item(item_id, item_content) # Send item to queue # send paste to Global relay_message = "{0} {1}".format(item_id, gzip64encoded) self.p.populate_set_out(relay_message, 'Mixer') # increase nb of paste by feeder name self.r_serv_log_submit.hincrby("mixer_cache:list_feeder", "crawler", 1) # tag crawled paste msg = 'infoleak:submission="crawler";{}'.format(item_id) self.p.populate_set_out(msg, 'Tags')
message = 'Checked {} credentials found.'.format(len(all_credentials)) if all_sites: message += ' Related websites: {}'.format(', '.join(all_sites)) print(message) to_print = 'Credential;{};{};{};{};{}'.format( Item.get_source(item_id), Item.get_item_date(item_id), Item.get_item_basename(item_id), message, item_id) # if the number of creds is above the threshold, publish an alert if len(all_credentials) > criticalNumberToAlert: print("========> Found more than 10 credentials in this file : {}".format(item_id)) publisher.warning(to_print) #Send to duplicate p.populate_set_out(item_id, 'Duplicate') msg = 'infoleak:automatic-detection="credential";{}'.format( item_id) p.populate_set_out(msg, 'Tags') site_occurence = regex_helper.regex_findall( module_name, redis_cache_key, regex_site_for_stats, item_id, item_content, max_time=max_execution_time, r_set=False) creds_sites = {}
clean_card = re.sub('[^0-9]', '', card) if lib_refine.is_luhn_valid(clean_card): print(clean_card, 'is valid') creditcard_set.add(clean_card) paste.__setattr__(channel, creditcard_set) paste.save_attribute_redis(channel, creditcard_set) pprint.pprint(creditcard_set) to_print = 'CreditCard;{};{};{};'.format( paste.p_source, paste.p_date, paste.p_name) if (len(creditcard_set) > 0): publisher.warning('{}Checked {} valid number(s);{}'.format( to_print, len(creditcard_set), paste.p_path)) print('{}Checked {} valid number(s);{}'.format( to_print, len(creditcard_set), paste.p_path)) #Send to duplicate p.populate_set_out(filename, 'Duplicate') # send to alertHandler msg = 'creditcard;{}'.format(filename) p.populate_set_out(msg, 'alertHandler') msg = 'infoleak:automatic-detection="credit-card";{}'.format(filename) p.populate_set_out(msg, 'Tags') else: publisher.info('{}CreditCard related;{}'.format(to_print, paste.p_path)) else: publisher.debug("Script creditcard is idling 10s") time.sleep(10)
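A short, illustrative implementation of the Luhn check that lib_refine.is_luhn_valid() presumably performs (the standard algorithm, shown here only to document what "valid" means for the candidate numbers above):

def is_luhn_valid(card_number):
    # Standard Luhn checksum: double every second digit from the right,
    # subtract 9 when the doubled digit exceeds 9, and require the total
    # to be a multiple of 10.
    digits = [int(d) for d in str(card_number)]
    checksum = 0
    for i, d in enumerate(reversed(digits)):
        if i % 2 == 1:
            d *= 2
            if d > 9:
                d -= 9
        checksum += d
    return checksum % 10 == 0

print(is_luhn_valid('4532015112830366'))   # True for this well-known test number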