Example #1
def update_hash_prefix_cache():
    active = get_active()
    if active and active['ctime'] and active['mtime'] and min(
            active['ctime'], active['mtime']) >= (time.time() - (30 * 60)):
        # no need to update, active DB exists and is recent
        logger.info('active database is fresh')
        inactive = get_inactive()
        # remove the inactive database if it exists to free up disk space
        remove_inactive(inactive)
    else:
        # we need to update the inactive DB, so get its info and delete it
        inactive = get_inactive()
        remove_inactive(inactive)

        # download to temporary file name
        tmp_file = inactive['name'] + '.tmp'
        logger.info('downloading database to ' + tmp_file)
        sbl = SafeBrowsingList(gsb_api_key, tmp_file, True)
        sbl.update_hash_prefix_cache()
        logger.info("finished creating " + tmp_file)

        # rename to inactive file name
        if path.isfile(tmp_file + JOURNAL):
            rename(tmp_file + JOURNAL, inactive['name'] + JOURNAL)
            logger.info("renamed " + tmp_file + JOURNAL + ' to ' +
                        inactive['name'] + JOURNAL)
        rename(tmp_file, inactive['name'])
        logger.info("renamed " + tmp_file + ' to ' + inactive['name'])
Example #2
def update_hash_prefix_cache():
    logger.info('opening database at ' + dbfile)
    sbl = SafeBrowsingList(gsb_api_key, dbfile, True)

    logger.info('updating database at ' + dbfile)
    sbl.update_hash_prefix_cache()

    logger.info('checkpointing database at ' + dbfile)
    with sbl.storage.get_cursor() as dbc:
        dbc.execute('PRAGMA wal_checkpoint(FULL)')
    sbl.storage.db.commit()

    logger.info("all done!")
Example #3
class SafeBrowsing(object):
    TYPE = "GoogleSBCheck"
    CP_FMT = '{scheme}://{netloc}/{path}'

    def __init__(self,
                 name=None,
                 api_key=None,
                 db_path='/tmp/gsb_4.db',
                 update_hash_prefix_cache=False):
        self.api_key = api_key
        self.db_path = db_path

        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except OSError:
            # force a download if the local database file does not exist yet
            self.update_hash_prefix_cache = True

        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()

    def is_blacklisted(self, url):
        return self.lookup_url(url) is not None

    def lookup_url(self, url):
        up = urlparse(url)
        cp = self.CP_FMT.format(**{
            'scheme': up.scheme,
            'netloc': up.netloc,
            'path': up.path
        }).strip('/') + '/'
        return self.sbl.lookup_url(cp)

    def handle_domain(self, domain):
        return self.handle_domains([
            domain,
        ])

    def handle_domains(self, domains):
        results = {}
        for domain in domains:
            t = "https://" + domain
            u = "http://" + domain
            results[domain] = False
            if self.lookup_url(t) or self.lookup_url(u):
                results[domain] = True
                continue
        return results
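A short usage sketch for the wrapper in Example #3; the API key and domain names are placeholders, and the first run will download the full hash-prefix dataset if `/tmp/gsb_4.db` does not exist yet.

# Hypothetical usage of the SafeBrowsing wrapper from Example #3.
checker = SafeBrowsing(api_key='YOUR_GSB_API_KEY', db_path='/tmp/gsb_4.db')

# handle_domains() probes each domain over http and https and returns a
# dict mapping domain -> True (matched a threat list) or False (clean).
results = checker.handle_domains(['example.com', 'example.org'])
for domain, flagged in results.items():
    print(domain, 'flagged' if flagged else 'clean')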
Example #4
class SafeBrowsing(object):
    def __init__(self,
                 api_key,
                 db_path=LINUX_DFT_PATH,
                 update_hash_prefix_cache=False):
        global API_KEY, DB_PATH
        API_KEY = api_key
        DB_PATH = db_path

        self.sbl = SafeBrowsingList(api_key, db_path=db_path)
        self.update_hash_prefix_cache = update_hash_prefix_cache
        try:
            os.stat(db_path)
        except OSError:
            # force a download if the local database file does not exist yet
            self.update_hash_prefix_cache = True

        if self.update_hash_prefix_cache:
            # this may take a while so be patient (over 1600MB of data)
            self.sbl.update_hash_prefix_cache()

    def is_blacklisted(self, url):
        return SafeBrowsing.thread_safe_lookup(url) is not None

    def lookup_url(self, url):
        # cp_fmt = '{scheme}://{netloc}/{path}'
        # up = URLPARSE(url)
        # cp = cp_fmt.format(**{'scheme':up.scheme, 'netloc':up.netloc, 'path':up.path}).strip('/')+'/'
        return self.sbl.lookup_url(url)

    @classmethod
    def init(cls, api_key):
        return SafeBrowsing(api_key)

    @staticmethod
    def set_global(api_key, db_path='/tmp/gsb_4.db'):
        global SB_CHECK, API_KEY, DB_PATH
        API_KEY = api_key
        DB_PATH = db_path
        SB_CHECK = SafeBrowsing(api_key, db_path=db_path)

    @staticmethod
    def thread_safe_lookup(url):
        # a fresh connection per call keeps lookups safe across threads
        sbl = SafeBrowsing(API_KEY, db_path=DB_PATH)
        return sbl.lookup_url(url)
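Example #4 keeps its configuration in module-level globals, so a plausible calling pattern is to call `set_global()` once at startup and let worker threads go through `thread_safe_lookup()`, which opens its own connection per call. The key, database path and URLs below are placeholders.

import threading

# Hypothetical driver for Example #4; key, path and URLs are placeholders.
SafeBrowsing.set_global('YOUR_GSB_API_KEY', db_path='/tmp/gsb_4.db')


def check(url):
    result = SafeBrowsing.thread_safe_lookup(url)
    print(url, result if result else 'no match')


threads = [threading.Thread(target=check, args=(u,))
           for u in ('http://example.com/', 'http://example.org/')]
for t in threads:
    t.start()
for t in threads:
    t.join()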
Example #5
class Security:
    def __init__(self):
        self.sbl = SafeBrowsingList(GoogleConfig.SAFEBROWSINGAPIKEY)
        self.sbl.update_hash_prefix_cache()

    def validate_referer(self, url):
        # lookup_url() returns the matching threat lists, or None if the URL is clean
        return self.sbl.lookup_url(url)

    def get_referer(self):
        referer = request.referrer
        if not referer:
            return None
        return referer

    @staticmethod
    def is_safe_url(url):
        ref_url = urlparse(request.host_url)
        test_url = urlparse(urljoin(request.host_url, url))
        return test_url.scheme in ('http', 'https') and \
               ref_url.netloc == test_url.netloc
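The `Security` helper in Example #5 reads Flask's `request` object, so a natural home for it is a redirect endpoint. The route below is a hypothetical sketch (the app setup, endpoint and `next` parameter are assumptions, and it relies on the `GoogleConfig` used above): `is_safe_url()` blocks off-site redirects and `validate_referer()` screens the referrer against Safe Browsing.

# Hypothetical Flask route built on the Security class from Example #5.
from flask import Flask, abort, redirect, request

app = Flask(__name__)
security = Security()  # updates the hash prefix cache at startup


@app.route('/go')
def go():
    target = request.args.get('next', '/')
    if not Security.is_safe_url(target):
        abort(400)  # refuse redirects that leave this host
    referer = security.get_referer()
    if referer and security.validate_referer(referer):
        abort(403)  # referrer matched a Safe Browsing threat list
    return redirect(target)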
Example #6
	mainStart = time.time()

	
	#update GSB dataset
	start = time.time()
	print "Updating local GSB dataset..."
	print datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
	try:

		sbl = SafeBrowsingList(config['gsb-api']['key'])	

		con = mdb.connect(config['mysql']['host'], config['mysql']['username'], config['mysql']['password'], config['mysql']['database_5'], charset='utf8')
		con.autocommit(True)
		cur = con.cursor()

		sbl.update_hash_prefix_cache()
		#hash_prefixes = sbl.get_all_hash_prefixes() #from my modified version
		
		gglsbl_db = "/tmp/gsb_v4.db"
		sql_db = sqlite3.connect(gglsbl_db)
		cursor = sql_db.cursor()
		cursor.execute('''SELECT HEX(value) from hash_prefix''') #get all hash prefixes
		#cursor.execute('''SELECT value from full_hash''') #get all full hashes
		all_rows = cursor.fetchall()
		
		gsb_url_hash_prefix_dict = {}
		for url_hash_prefix in all_rows:
			gsb_url_hash_prefix_dict[url_hash_prefix] = True
		
		sql = "INSERT INTO gsb_update_log_5(num_urls_gglsbl4, num_unique_urls_gglsbl4) VALUES(%s, %s)"
		cur.execute(sql, (len(all_rows), len(gsb_url_hash_prefix_dict)))
Example #7
    def worker(self, id, queue):

        with open(r'config\gglsbl.auth', 'r') as auth_file:
            gglsbl_key = auth_file.read().strip()

        sbl = SafeBrowsingList(gglsbl_key,
                               db_path=r"dataset\google_safe_browisng_db")
        # sbl.update_hash_prefix_cache()

        turn = True
        while True:

            # Update Google SBL database every 12 hours at time X (e.g. 3 AM and 3 PM)
            hour = datetime.datetime.today().hour
            if hour % 12 == 3 and turn:
                sbl.update_hash_prefix_cache()
                turn = False
            elif hour % 12 != 3:
                turn = True

            today = get_date()
            with open(os.path.join('results', today + '.ioc.csv'),
                      'a+',
                      encoding='utf_8') as output_file:
                tweet = queue.get()
                try:
                    if hasattr(tweet, 'retweeted_status') and hasattr(
                            tweet.retweeted_status, 'extended_tweet'
                    ) and 'full_text' in tweet.retweeted_status.extended_tweet:
                        text = tweet.retweeted_status.extended_tweet[
                            'full_text']
                    elif hasattr(tweet, 'extended_tweet'
                                 ) and 'full_text' in tweet.extended_tweet:
                        text = tweet.extended_tweet['full_text']
                    elif not hasattr(tweet, 'text'):
                        text = tweet['text']
                    else:
                        text = tweet.text

                    if hasattr(tweet, 'retweeted_status'):
                        if hasattr(tweet.retweeted_status, 'extended_tweet'):
                            final_urls = tweet.retweeted_status.extended_tweet[
                                'entities']['urls']
                        else:
                            final_urls = tweet.retweeted_status.entities[
                                'urls']
                    else:
                        if hasattr(tweet, 'extended_tweet'):
                            final_urls = tweet.extended_tweet['entities'][
                                'urls']
                        else:
                            final_urls = tweet.entities['urls']

                    for final_url in final_urls:
                        # If a pastebin URL, get the raw content and append it to the tweet content
                        if final_url['expanded_url'].startswith(
                                'https://pastebin.com/'):
                            pastebin = final_url['expanded_url']
                            if 'raw' not in pastebin:
                                pastebin = pastebin.replace(
                                    'https://pastebin.com/',
                                    'https://pastebin.com/raw/')

                            req = requests.get(pastebin)
                            text += '\n' + req.text  # .text is str (.content is bytes)

                    user_type = 'top'
                    if tweet.user.id_str in self.rand_users:
                        user_type = 'rand'

                    print(
                        "###########################$$$$$$$$$$$$$$$$$$$$$$$$$$$"
                    )
                    print(text)

                    # classifier must be retrained with new data
                    # vector = vectorize(text, self.wordlist)
                    # vector.append(len(tweet.entities['hashtags']))
                    # vector.append(len(tweet.entities['user_mentions']))
                    # vector = numpy.array(vector).reshape(1, -1)
                    # estimates = []
                    # for i in range(number_of_classifiers):
                    #     y_estimate = self.classifiers[i].predict(vector)
                    #     estimates.append(y_estimate)
                    # vote = statistics.mode([x[0] for x in estimates])
                    # print("Prediction: "+vote)

                    ips = list(iocextract.extract_ips(text, refang=True))
                    for ip in ips:
                        if ip not in text:
                            output_file.write('{},{},{},{},{},ip,{}\n'.format(
                                tweet.id, tweet.created_at, user_type,
                                tweet.user.id_str, tweet.user.screen_name, ip))

                    urls = list(iocextract.extract_urls(text, refang=True))
                    for url in urls:
                        if url not in text:
                            result = sbl.lookup_url(url.rstrip('.'))
                            if result is not None:
                                output_file.write(
                                    '{},{},{},{},{},url,{},{}\n'.format(
                                        tweet.id, tweet.created_at, user_type,
                                        tweet.user.id_str,
                                        tweet.user.screen_name,
                                        url.rstrip('.'), result))
                            else:
                                output_file.write(
                                    '{},{},{},{},{},url,{},benign\n'.format(
                                        tweet.id, tweet.created_at, user_type,
                                        tweet.user.id_str,
                                        tweet.user.screen_name,
                                        url.rstrip('.')))

                    emails = list(iocextract.extract_emails(text, refang=True))
                    for email in emails:
                        if email not in text:
                            output_file.write(
                                '{},{},{},{},{},email,{}\n'.format(
                                    tweet.id, tweet.created_at, user_type,
                                    tweet.user.id_str, tweet.user.screen_name,
                                    email))
                    hashes = list(iocextract.extract_hashes(text))
                    for hash in hashes:
                        output_file.write('{},{},{},{},{},hash,{}\n'.format(
                            tweet.id, tweet.created_at, user_type,
                            tweet.user.id_str, tweet.user.screen_name, hash))
                except Exception as exp:
                    print(exp)

                queue.task_done()
Example #8
class URLMonitor(Plugin):

    blacklist = []
    moderators = []
    sbl = None

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.blacklist = []  # per-instance list so instances do not share state
        self.moderators = self.plugin_config['MODERATORS']

        # Initialize Safe Browsing API
        if self.plugin_config['GOOGLE_SAFE_BROWSING']:
            self.sbl = SafeBrowsingList(
                self.plugin_config['GOOGLE_SAFE_BROWSING_API_KEY'])
            self.sbl.update_hash_prefix_cache()

        # Populate Blacklist from URLS
        for url in self.plugin_config['BLACKLISTS']:
            url = url.strip()
            if url.endswith('.json'):
                r = requests.get(url)
                # Assuming MEW List format
                for item in r.json():
                    self.blacklist.append(item['id'])

            elif url.endswith('.csv'):
                print('csv not implemented')  # TODO
            else:
                print('txt not implemented')  # TODO

        print(self.__class__.__name__, 'initialized')

    def process_message(self, data):
        # print(data)
        chan = data['channel']
        text = data['text']

        # Private (Groups) or Public Channels
        if chan.startswith('C') or chan.startswith('G'):

            # Find all URLS in message text, extract host and compare against blacklist and Google Safebrowsing
            urls = re.findall(
                r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
                text)

            def alert(url):
                # TODO flag user
                # TODO early warning system
                self.slack_client.api_call(
                    'chat.postMessage',
                    channel=self.plugin_config['MODERATE_CHAN'],
                    text=' '.join(self.moderators) + ' ' +
                    text)  # TODO can probably use outputs for this
                if len(self.plugin_config['WARNING_MESSAGE']):
                    self.outputs.append(
                        [data['channel'], self.plugin_config['WARNING_MESSAGE']])

            for u in urls:
                o = urlparse(u)
                host = re.split(r":\d{,4}", o.netloc)[0]

                # Check Blacklist
                if host in self.blacklist:
                    alert(u)
                    break
                # Check Google Safebrowsing
                elif self.sbl and self.sbl.lookup_url(u):
                    alert(u)
                    break
Example #9
def updateCache():
    sbl = SafeBrowsingList(GOOGLE_SAFEBROWSE_API_KEY, db_path="/opt/crawler/gsb_v3.db")
    sbl.update_hash_prefix_cache()
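Once `updateCache()` has populated `/opt/crawler/gsb_v3.db`, lookups can reuse the same file without another download. A minimal sketch, assuming the same key constant:

# Hypothetical lookup against the database maintained by updateCache().
def checkUrl(url):
    sbl = SafeBrowsingList(GOOGLE_SAFEBROWSE_API_KEY, db_path="/opt/crawler/gsb_v3.db")
    return sbl.lookup_url(url)  # matching threat lists, or None if clean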