def computeURLHashes(twitter_urls):
    twitter_urls_hashes_dict = {}
    num_urls = len(twitter_urls)
    counter = 0
    printCounter = 0
    for url in twitter_urls:
        if url not in twitter_urls_hashes_dict:
            # URL(...).hashes (gglsbl) yields one SHA-256 digest per
            # canonical permutation of the URL.
            url_hashes = URL(url).hashes
            hash_list = []
            for h in url_hashes:
                hash_list.append(h)
            twitter_urls_hashes_dict[url] = hash_list
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, num_urls, '%s of %s' % (counter, num_urls))
            printCounter = 0
    return twitter_urls_hashes_dict
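
# The functions in this file call progress_bar() to report batch progress.
# It is defined elsewhere in the project; the version below is only a
# minimal sketch, assuming it renders an in-place terminal progress bar
# with a caller-supplied suffix string.
import sys

def progress_bar(count, total, suffix=''):
    # Sketch: prints e.g. "[####......] 40.0% 400 of 1000" on one line.
    bar_len = 40
    total = max(total, 1)
    filled_len = int(round(bar_len * count / float(total)))
    bar = '#' * filled_len + '.' * (bar_len - filled_len)
    percent = round(100.0 * count / float(total), 1)
    sys.stdout.write('[%s] %s%% %s\r' % (bar, percent, suffix))
    sys.stdout.flush()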
def importCertStreamURLs(limit):
    counter = 0
    printCounter = 0
    cur = con.cursor()
    # Parameterised query rather than string concatenation.
    cur.execute("SELECT domain FROM certstream_domains_5 ORDER BY id DESC LIMIT %s",
                (int(limit),))
    row = cur.fetchone()
    domains = []
    while row is not None:
        formatted_domain = formatURL("http://" + row[0])
        domains.append(formatted_domain)
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, limit, '%s of %s' % (counter, limit))
            printCounter = 0
        row = cur.fetchone()
    cur.close()
    return domains
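
# formatURL() canonicalises a URL before it is used as a dictionary key.
# Its real definition lives elsewhere in the project; this is a minimal
# sketch, assuming it lower-cases the scheme/host and strips any trailing
# slash (the old `url.rstrip('/')` fragments elsewhere suggest the latter).
from urlparse import urlparse, urlunparse

def formatURL(url):
    parts = urlparse(url.strip())
    path = parts.path.rstrip('/')
    return urlunparse((parts.scheme.lower(), parts.netloc.lower(), path,
                       parts.params, parts.query, parts.fragment))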
def importRecentTCoFiltered(limit):
    cur = con.cursor()
    cur.execute("SELECT full_url FROM t_co_all_urls_experiment ORDER BY id DESC LIMIT %s",
                (int(limit),))
    row = cur.fetchone()
    twitter_urls_dict = {}
    printCounter = 0
    counter = 0
    while row is not None:
        formatted_url = formatURL(row[0])
        twitter_urls_dict[formatted_url] = None
        counter += 1
        printCounter += 1
        if printCounter == 100:
            progress_bar(counter, limit, '%s of %s' % (counter, limit))
            printCounter = 0
        row = cur.fetchone()
    cur.close()
    return twitter_urls_dict
def URLLookup(twitter_urls, phishing_urls, blacklist):
    counter = 0
    match_counter = 0
    netloc_counter = 0
    printCounter = 0
    num_urls = len(twitter_urls)
    for url in twitter_urls:
        if url in phishing_urls:
            match_counter += 1
            print url
            logPhishingURL(url, 5, twitter_urls[url], blacklist)
            markTweetsPhishy(url, twitter_urls[url])
        # Also check every host-suffix/path-prefix permutation of the URL.
        url_permutations = URLPermutations(url)
        for url_p in url_permutations:
            if url_p in phishing_urls:
                match_counter += 1
                print url_p
                # Tweet ids are keyed on the original URL, not the permutation.
                logPhishingURL(url, 5, twitter_urls[url], blacklist)
                markTweetsPhishy(url, twitter_urls[url])
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, num_urls, '%s of %s ' % (counter, num_urls))
            printCounter = 0
    return (match_counter, netloc_counter)
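
# URLPermutations() is defined elsewhere in the project. Judging by how it
# is used, it generates Safe Browsing style host-suffix / path-prefix
# expressions for a URL (at most 5 host suffixes x 6 path prefixes). A
# sketch under that assumption:
from urlparse import urlparse

def URLPermutations(url):
    parts = urlparse(url)
    host = parts.netloc.lower()
    path = parts.path or '/'
    # Exact host plus up to four suffixes of its last five components.
    labels = host.split('.')
    hosts = [host]
    for i in range(max(len(labels) - 5, 1), len(labels) - 1):
        hosts.append('.'.join(labels[i:]))
    # Exact path (with and without query), the root, and directory prefixes.
    paths = [path + '?' + parts.query] if parts.query else []
    paths.append(path)
    segments = [s for s in path.split('/') if s]
    for i in range(min(len(segments), 4)):
        candidate = '/' + '/'.join(segments[:i]) + ('/' if i else '')
        if candidate not in paths:
            paths.append(candidate)
    return [h + p for h in hosts for p in paths[:6]]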
def importRecentTwitterURLs(limit, from_id=None, to_id=None):
    cur = con.cursor()
    if from_id is not None and to_id is not None:
        print "importing by range (" + str(from_id) + " to " + str(to_id) + ")"
        cur.execute("SELECT url,tweet_id FROM tweet_urls_5b WHERE id >= %s AND id <= %s",
                    (int(from_id), int(to_id)))
    else:
        print "importing recent by limit (" + str(limit) + ")"
        cur.execute("SELECT url,tweet_id FROM tweet_urls_5b ORDER BY id DESC LIMIT %s",
                    (int(limit),))
    row = cur.fetchone()
    twitter_urls_dict = {}
    printCounter = 0
    counter = 0
    while row is not None:
        formatted_url = formatURL(row[0])
        tweet_id = row[1]
        # Map each unique URL to the set of tweet ids it appeared in.
        if formatted_url not in twitter_urls_dict:
            twitter_urls_dict[formatted_url] = {tweet_id: None}
        else:
            twitter_urls_dict[formatted_url][tweet_id] = None
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, limit, '%s of %s' % (counter, limit))
            printCounter = 0
        row = cur.fetchone()
    print "twitter_urls_dict len: ", len(twitter_urls_dict)
    cur.close()
    return twitter_urls_dict
def importRedirectionChainURLs(limit):
    cursor = con.cursor()
    print "importing redirection chain from most recent tweets..."
    cursor.execute("SELECT redirection_chain,id FROM tweets_5 ORDER BY id DESC LIMIT %s",
                   (int(limit),))
    row = cursor.fetchone()
    printCounter = 0
    counter = 0
    twitter_urls_dict = {}
    while row is not None:
        tweet_id = row[1]
        if row[0] is not None:
            urls = row[0].split(" -> ")
            # Skip the first URL in each chain: it will already have been
            # checked by importRecentTwitterURLs.
            for url in urls[1:]:
                formatted_url = formatURL(url)
                if formatted_url not in twitter_urls_dict:
                    twitter_urls_dict[formatted_url] = {tweet_id: None}
                else:
                    twitter_urls_dict[formatted_url][tweet_id] = None
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, limit, '%s of %s' % (counter, limit))
            printCounter = 0
        row = cursor.fetchone()
    cursor.close()
    print "num unique urls: ", len(twitter_urls_dict)
    return twitter_urls_dict
def produceURLPermutations(url_dict):
    print "producing URL permutations..."
    url_permutations_dict = {}
    counter = 0
    printCounter = 0
    for url in url_dict:
        url_permutations = URLPermutations(url)
        for url_p in url_permutations:
            url_permutations_dict[url_p] = False
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, len(url_dict),
                         '%s of %s' % (counter, len(url_dict)))
            printCounter = 0
    return url_permutations_dict
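
# Hedged usage sketch for produceURLPermutations(): it expands each URL in a
# dict into its lookup expressions, each keyed with False ("not yet matched").
# Which side of the comparison gets expanded depends on the calling script;
# one plausible chain (loadPhishTank() is a hypothetical loader, not a
# project function):
#
#   phishing_urls = produceURLPermutations(loadPhishTank())
#   twitter_urls = importRecentTwitterURLs(50000)
#   matches, netlocs = URLLookup(twitter_urls, phishing_urls, "phishtank")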
def importRecentTwitterURLs(limit):
    cursor = con.cursor()
    print "importing most recently tweeted urls..."
    cursor.execute("SELECT url,tweet_id FROM tweet_urls_5 ORDER BY id DESC LIMIT %s",
                   (int(limit),))
    row = cursor.fetchone()
    printCounter = 0
    counter = 0
    twitter_urls_dict = {}
    while row is not None:
        formatted_url = formatURL(row[0])
        tweet_id = row[1]
        if formatted_url not in twitter_urls_dict:
            twitter_urls_dict[formatted_url] = {tweet_id: None}
        else:
            twitter_urls_dict[formatted_url][tweet_id] = None
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, limit, '%s of %s' % (counter, limit))
            printCounter = 0
        row = cursor.fetchone()
    cursor.close()
    print "num unique urls: ", len(twitter_urls_dict)
    return twitter_urls_dict
def importRedirectionChain(start, batch_size):
    cursor = con.cursor()
    print "import redirection chains\nstarting at " + str(start) + ", batch size " + str(batch_size)
    cursor.execute("SELECT redirection_chain,id FROM tweets_5 LIMIT %s, %s",
                   (int(start), int(batch_size)))
    row = cursor.fetchone()
    printCounter = 0
    counter = 0
    twitter_urls_dict = {}
    while row is not None:
        if row[0] is not None:
            urls = row[0].split(" -> ")
            tweet_id = row[1]
            for url in urls:
                formatted_url = formatURL(url)
                if formatted_url not in twitter_urls_dict:
                    twitter_urls_dict[formatted_url] = {tweet_id: None}
                else:
                    twitter_urls_dict[formatted_url][tweet_id] = None
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, batch_size, '%s of %s' % (counter, batch_size))
            printCounter = 0
        row = cursor.fetchone()
    cursor.close()
    print "num unique urls: ", len(twitter_urls_dict)
    return twitter_urls_dict
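
# URLLookup_v2() below depends on lookup_gsb_full_hash(), defined elsewhere.
# From its call sites it returns True only when a full hash has *not* yet
# been resolved against GSB (a False return hits the "already checked"
# branch). A sketch under that assumption; the table name is a guess based
# on the inserts performed below:
def lookup_gsb_full_hash(full_hash):
    cur = con.cursor()
    cur.execute("SELECT 1 FROM gsb_full_hash_log_5 WHERE full_hash = %s LIMIT 1",
                (full_hash,))
    seen = cur.fetchone() is not None
    cur.close()
    return not seen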
def URLLookup_v2(twitter_urls, gsb_url_hash_prefixes, twitter_urls_dict,
                 redirection_chain_url_lookup):
    # twitter_urls maps url -> [full hashes]
    # twitter_urls_dict maps url -> {tweet_ids}
    counter = 0
    hash_prefix_counter = 0
    hash_prefix_no_collision_counter = 0
    printCounter = 0
    num_urls = len(twitter_urls)
    malware_count = 0
    phish_count = 0
    url_already_checked_ctr = 0
    url_matches_in_gsb = 0
    gsb_lookup_counter = 0
    malware_matches = {}
    phishing_matches = {}
    for url in twitter_urls:
        url_hashes = twitter_urls[url]
        for url_hash in url_hashes:
            hash_prefix = sqlite3.Binary(url_hash[0:4])
            hash_prefix = str(hash_prefix).encode('hex')
            if hash_prefix in gsb_url_hash_prefixes:
                if gsb_url_hash_prefixes[hash_prefix] == 6:
                    hash_prefix_no_collision_counter += 1
                if lookup_gsb_full_hash(sqlite3.Binary(url_hash)):
                    database_lock = True
                    while database_lock:
                        try:
                            gsb_lookup_counter += 1
                            gsblookup = sbl.lookup_url(url)
                            if gsblookup:
                                url_matches_in_gsb += 1
                                cur = con.cursor()
                                sql = "INSERT INTO gsb_full_hash_log_5(url, hash_prefix, full_hash) VALUES(%s, %s, %s)"
                                cur.execute(sql, (url[0:500],
                                                  sqlite3.Binary(url_hash[0:4]),
                                                  sqlite3.Binary(url_hash)))
                                for i in gsblookup:
                                    if str(i) == "MALWARE/ANY_PLATFORM/URL":
                                        print url
                                        cur.execute("UPDATE gsb_full_hash_log_5 SET malware = '1' WHERE full_hash = %s",
                                                    (url_hash,))
                                        con.commit()
                                        malware_count += 1
                                        malware_matches[url] = url_hash
                                        logMalwareURL(url, 4, twitter_urls_dict[url],
                                                      redirection_chain_url_lookup)
                                    if str(i) == "SOCIAL_ENGINEERING/ANY_PLATFORM/URL":
                                        print url
                                        cur.execute("UPDATE gsb_full_hash_log_5 SET social_engineering = '1' WHERE full_hash = %s",
                                                    (url_hash,))
                                        con.commit()
                                        phish_count += 1
                                        logPhishingURL(url, 4, twitter_urls_dict[url],
                                                       redirection_chain_url_lookup)
                                        phishing_matches[url] = url_hash
                            else:
                                # Hash prefix matched but the full hash is not in
                                # GSB, i.e. a prefix collision with a different
                                # URL. Mark it so it is not checked again.
                                cur = con.cursor()
                                sql = "INSERT INTO gsb_full_hash_log_5(hash_prefix, full_hash, not_in_gsb) VALUES(%s, %s, %s)"
                                cur.execute(sql, (sqlite3.Binary(url_hash[0:4]),
                                                  sqlite3.Binary(url_hash), 1))
                            database_lock = False
                        except KeyError:
                            # 18 Jun 2018: KeyError('matches',) appears every
                            # minute or so; log it and carry on.
                            print "Looks like a key error:", sys.exc_info()[1]
                            log("certstream-url-checker-v2-phishing_5.txt",
                                "Looks like a key error: " + str(sys.exc_info()[1]))
                            print "URL:", url
                            time.sleep(5)
                            database_lock = False
                        except (RuntimeError, IntegrityError, urllib2.HTTPError,
                                urllib2.URLError, SocketError) as e:
                            print e
                            print url
                            log("tpl_fast_v2-output.txt",
                                "error: " + str(e.message) + "\nURL: " + url + "\n")
                            print "waiting 5 seconds..."
                            time.sleep(5)
                            database_lock = False
                        except sqlite3.OperationalError:
                            # Keep database_lock True so the lookup is retried
                            # once the gglsbl database is unlocked. (This
                            # exception must not also appear in the tuple above,
                            # or this handler is unreachable.)
                            print "database locked, waiting 5 seconds..."
                            log("tpl_v3.2-output.txt", "gglsbl3 database is locked")
                            time.sleep(5)
                else:
                    # Full hash already resolved on a previous run.
                    url_already_checked_ctr += 1
                hash_prefix_counter += 1
        counter += 1
        printCounter += 1
        if printCounter == 1000:
            progress_bar(counter, num_urls, '%s of %s' % (counter, num_urls))
            printCounter = 0
    return (phish_count, malware_count, phishing_matches, malware_matches,
            hash_prefix_counter, hash_prefix_no_collision_counter,
            url_already_checked_ctr, url_matches_in_gsb, gsb_lookup_counter)
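
# The lookup functions rely on module-level globals initialised elsewhere:
# `con` (a MySQLdb connection) and `sbl` (a gglsbl SafeBrowsingList backed by
# a local sqlite store). A sketch of that setup, kept commented out so it
# cannot shadow the real one; host, credentials, API key and database path
# are all placeholders:
#
#   import MySQLdb
#   from gglsbl import SafeBrowsingList
#
#   con = MySQLdb.connect(host="localhost", user="user", passwd="secret", db="twitter")
#   sbl = SafeBrowsingList("GSB_API_KEY", db_path="/tmp/gsb_v4.db")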
def URLLookup_v2(domain_hashes, gsb_urls):
    # domain_hashes maps url -> [full hashes]
    phish_count = 0
    malware_count = 0
    phishing_matches = {}
    counter = 0
    hash_prefix_counter = 0
    printCounter = 0
    num_urls = len(domain_hashes)
    for url in domain_hashes:
        url_hashes = domain_hashes[url]
        for url_hash in url_hashes:
            hash_prefix = sqlite3.Binary(url_hash[0:4])
            hash_prefix = str(hash_prefix).encode('hex')
            if hash_prefix in gsb_urls:
                if lookup_gsb_full_hash(sqlite3.Binary(url_hash)):
                    hash_prefix_counter += 1
                    database_lock = True
                    while database_lock:
                        try:
                            print "url hash prefix match!"
                            gsblookup = sbl.lookup_url(url)
                            if gsblookup:
                                cur = con.cursor()
                                sql = "INSERT INTO gsb_full_hash_log_certstream_5(url, hash_prefix, full_hash) VALUES(%s, %s, %s)"
                                cur.execute(sql, (url[0:500],
                                                  sqlite3.Binary(url_hash[0:4]),
                                                  sqlite3.Binary(url_hash)))
                                print "url full hash match!"
                                for i in gsblookup:
                                    print i
                                    if str(i) == "MALWARE/ANY_PLATFORM/URL":
                                        cur.execute("UPDATE gsb_full_hash_log_certstream_5 SET malware = '1' WHERE full_hash = %s",
                                                    (url_hash,))
                                        con.commit()
                                        print "malware"
                                        malware_count += 1
                                    if str(i) == "SOCIAL_ENGINEERING/ANY_PLATFORM/URL":
                                        cur.execute("UPDATE gsb_full_hash_log_certstream_5 SET social_engineering = '1' WHERE full_hash = %s",
                                                    (url_hash,))
                                        con.commit()
                                        print "phishing"
                                        phish_count += 1
                                        phishing_matches[url] = url_hash
                            else:
                                # Prefix collision: the full hash is not in GSB,
                                # so mark it to avoid re-checking.
                                cur = con.cursor()
                                sql = "INSERT INTO gsb_full_hash_log_certstream_5(hash_prefix, full_hash, not_in_gsb) VALUES(%s, %s, %s)"
                                cur.execute(sql, (sqlite3.Binary(url_hash[0:4]),
                                                  sqlite3.Binary(url_hash), 1))
                            database_lock = False
                        except (RuntimeError, IntegrityError, urllib2.HTTPError,
                                urllib2.URLError, SocketError) as e:
                            print e
                            print url
                            log("tpl_v3.2-output.txt",
                                "error: " + str(e.message) + "\nURL: " + url + "\n")
                            print "waiting 5 seconds..."
                            time.sleep(5)
                            database_lock = False
                        except sqlite3.OperationalError:
                            print "database locked, waiting 5 seconds..."
                            log("tpl_v3.2-output.txt", "gglsbl3 database is locked")
                            time.sleep(5)
                            database_lock = False
                        except KeyError:
                            # 18 Jun 2018: KeyError('matches',) appears every
                            # minute or so; log it and carry on.
                            print "Looks like a key error:", sys.exc_info()[1]
                            log("certstream-url-checker-v2-phishing_5.txt",
                                "Looks like a key error: " + str(sys.exc_info()[1]))
                            print "URL:", url
                            time.sleep(5)
                            database_lock = False
                        except:
                            print "We have an error:", sys.exc_info()[1]
                            print sys.exc_info()
                            import os
                            import traceback
                            from send_email import sendAdminAlert
                            script_file_name = os.path.basename(__file__)
                            error_string = ""
                            for frame in traceback.extract_tb(sys.exc_info()[2]):
                                fname, lineno, fn, text = frame
                                error_string += "\nError in %s on line %d" % (fname, lineno)
                            print error_string
                            sendAdminAlert("Error (within URLLookup_v2 loop) in " + script_file_name,
                                           "Python script: " + script_file_name +
                                           "\nError reported: " + str(sys.exc_info()[1]) +
                                           "\nLine:" + str(error_string))
                            print "waiting 90 seconds"
                            time.sleep(90)
                            database_lock = False
                else:
                    print "URL already in gsb_full_hash_lookup"
        counter += 1
        printCounter += 1
        if printCounter == 10:
            progress_bar(counter, num_urls, '%s of %s' % (counter, num_urls))
            printCounter = 0
    return (phish_count, malware_count, phishing_matches)
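
# log() is used throughout for file-based error logging. It is defined
# elsewhere in the project; a minimal sketch, assuming it appends a
# timestamped line to the named file:
import datetime

def log(filename, message):
    with open(filename, 'a') as f:
        f.write(datetime.datetime.now().isoformat() + ' ' + message + '\n')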
                cur.execute("UPDATE spam_urls_5 SET gglsbl_timestamp = %s WHERE url = %s AND gglsbl_timestamp IS NULL",
                            (timestamp_matches_list[0], url))
                con.commit()
            elif len(timestamp_matches_dict) > 1:
                multiple_counter += 1
            elif len(timestamp_matches_dict) == 0:
                zero_match_counter += 1
            counter += 1
            printCounter += 1
            if printCounter == 10:
                progress_bar(counter, len(spam_urls),
                             '%s of %s' % (counter, len(spam_urls)))
                printCounter = 0

        print "\nURLs checked:", len(spam_urls)
        print "success:", success_counter
        print "zero matches:", zero_match_counter
        print "multi TSs:", multiple_counter
        conn.close()
    except:
        print "We have an error:", sys.exc_info()[1]
        import os
        import sys
        import traceback
        from send_email import sendAdminAlert