Example #1
0
 def setUp(self):
     self.dirs_to_remove = []
     self.db_conn = dbu.mysql_init_db('fp_detective_test')
     self.domainInfo = lp.DomainInfo() # create a new DomainInfo obj for tests
     
     self.domainInfo.rank = 1
     self.domainInfo.log_filename = '/var/log/syslog'
     self.domainInfo.url = 'http://google.com'
     self.domainInfo.fonts_loaded = ['Arial', 'Tahoma', 'Georgia', '微软雅黑']
     self.domainInfo.fonts_by_origins = {'http://google.com':['arial', 'Tahoma'], 'http://yahoo.com':['Georgia'] }
     self.domainInfo.requests = ['http://google.com', 'http://yahoo.com']
     self.domainInfo.responses = ['http://abc.com', 'http://xyz.com']
     self.domainInfo.num_font_loads = 50
     self.domainInfo.num_offsetWidth_calls = 15
     self.domainInfo.num_offsetHeight_calls = 15
     self.domainInfo.fp_detected = ['iesnare', 'bluecava']
     self.domainInfo.crawl_id = 64654
     self.domainInfo.fpd_logs = ['userAgent', 'appCodeName']
     self.domainInfo.fc_dbg_font_loads = ['Arial', 'Tahoma', 'Georgia', 'someotherfont', '微软雅黑']
     self.domainInfo.log_complete = 1
     
     ha = ag.HeadlessAgent()
     self.crawl_job = ag.CrawlJob(ha)
     self.dirs_to_remove.append(self.crawl_job.job_dir)
     self.crawl_job.urls = ['http://google.com', 'http://yahoo.com']
     self.crawl_job.desc
Example #2
0
    def should_crawl_and_find_swfs(self, url, expected_strings=(None, None), unexpected_strings=(None, None), crawler_type='chrome_lazy'):
        expected_fields, expected_values = expected_strings
        unexpected_fields, unexpected_values = unexpected_strings
        
        crawl_id = ag.crawl_sites([(1, url),], crawler_type, num_crawl_urls=1)
        
        db_conn = dbu.mysql_init_db()
        db_curs = db_conn.cursor(mdb.cursors.DictCursor)
        
        rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)
        
        if expected_fields:
            found_dict = {}
            for field in expected_fields:
                found_dict[field] = False
        
        for row in rows:
            if expected_fields:
                for expected_field, expected_value in zip(expected_fields, expected_values):
                    if expected_value in row[expected_field]:
                        found_dict[expected_field] = True
                        print 'found in ',  row[expected_field]
                    
            if unexpected_values:
                for unexpected_field, unexpected_value in zip(unexpected_fields, unexpected_values) :
                    if unexpected_value in row[unexpected_field]:
                        self.fail('Unexpected field %s with unexpected value %s found' %(unexpected_field, unexpected_value))                    
        if expected_fields:
            for field, found in found_dict.iteritems():
                if not found:
                    self.fail('Cannot find %s' % field)

        db_curs.close()
        db_conn.close()
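
    # Usage sketch (assumption, not in the original source): the field names and
    # values below are illustrative; expected/unexpected strings are (fields, values)
    # pairs matched against the SWF rows stored for the crawl.
    def test_should_find_swf_on_page(self):
        self.should_crawl_and_find_swfs(
            'http://example.com/page-with-flash',
            expected_strings=(('url',), ('.swf',)),
            unexpected_strings=((), ()))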
Example #3
0
 def setUp(self):
     self.dirs_to_remove = []
     self.db_conn = dbu.mysql_init_db('fp_detective_test')
     self.domainInfo = lp.DomainInfo() # create a new DomainInfo obj for tests
     
     self.domainInfo.rank = 1
     self.domainInfo.log_filename = '/var/log/syslog'
     self.domainInfo.url = 'http://google.com'
     self.domainInfo.fonts_loaded = ['Arial', 'Tahoma', 'Georgia', '微软雅黑']
     self.domainInfo.fonts_by_origins = {'http://google.com':['arial', 'Tahoma'], 'http://yahoo.com':['Georgia'] }
     self.domainInfo.requests = ['http://google.com', 'http://yahoo.com']
     self.domainInfo.responses = ['http://abc.com', 'http://xyz.com']
     self.domainInfo.num_font_loads = 50
     self.domainInfo.num_offsetWidth_calls = 15
     self.domainInfo.num_offsetHeight_calls = 15
     self.domainInfo.fp_detected = [fpr.FINGERPRINTER_REGEX.items()[:2]]
     self.domainInfo.crawl_id = 64654
     self.domainInfo.fpd_logs = ['userAgent', 'appCodeName']
     self.domainInfo.fc_dbg_font_loads = ['Arial', 'Tahoma', 'Georgia', 'someotherfont', '微软雅黑']
     self.domainInfo.log_complete = 1
     
     ha = ag.HeadlessAgent()
     self.crawl_job = ag.CrawlJob(ha)
     self.dirs_to_remove.append(self.crawl_job.job_dir)
     self.crawl_job.urls = ['http://google.com', 'http://yahoo.com']
     self.crawl_job.desc
Example #4
0
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):
    
    referer = msg.request.headers['Referer'][0] if msg.request.headers['Referer'] else ""
    
    if msg.response and msg.response.content:
        print msg.request.get_url()
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:  # too wide a check, but the decompiler will discard non-SWF content
            
            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()
            
            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)
            
            if not rows:
                swf_filename = os.path.join(dir_path, "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'
                    
                wl_log.info("SWF saved %s referrer: %s" % (os.path.basename(swf_filename), referer))
                
                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" % (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1
            
            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()
            
            swf_info.rank = rank # rank parsed from the prefix; may not be the real site rank
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer        
            swf_info.duplicate = duplicate_swf # 1 if a SWF with the same hash was already in the DB
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id
            
            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()
            
            
        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" % (msg.request.path, msg.response.content[:100]))
        else:
            pass
Example #5
0
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try:
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                worker(
                    msg, crawl_id
                )  # this worker func should take care of db insertion, logging etc.
        except flow.FlowReadError as exc:
            # ignore read errors from truncated or corrupt dumps
            # wl_log.critical("Error reading mitm dump %s" % exc)
            pass
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    print os.path.basename(dumpfile[:-4]).split('-')[0]  # debug: rank prefix parsed from the dump filename
    doma_info.rank = int(
        os.path.basename(dumpfile).split('-')[0]) if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

    # parse
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION

    insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                      site_info_id=site_info_id,
                                      db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun,
                       crawl_id)  # parse log, insert js info to db

    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
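
# Usage sketch (assumption, not in the original source): parse_mitm_dump expects a
# worker taking (msg, crawl_id); store_swfs from Example #4 fits that signature once
# dir_path and prefix are bound. The job directory and "rank-domain" prefix are placeholders.
def parse_dump_storing_swfs(basename, crawl_id, job_dir='/tmp/job'):
    worker = functools.partial(store_swfs, dir_path=job_dir,
                               prefix=os.path.join(job_dir, '1-example.com'))
    parse_mitm_dump(basename, worker, crawl_id)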
Example #6
0
 def __init__(self, agent):
     self.job_dir = create_job_folder()
     self.desc = ''
     self.urls = []
     self.url_tuples = []
     self.num_crawl_urls = 0
     self.max_parallel_procs = MAX_PARALLEL_PROCESSES
     self.crawl_agent = agent
     self.crawl_agent.crawl_job = self
     self.crawl_agent.job_dir = self.job_dir # for passing to multiprocessing worker - should find a better way
     self.index_html_log = os.path.join(self.crawl_agent.job_dir, 'index.html')
     self.db_conn = dbu.mysql_init_db('fp_detective')
     self.crawl_id = dbu.add_crawl_job_to_db(self, self.db_conn)
     self.crawl_agent.crawl_id = self.crawl_id
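
# Usage sketch (assumption, not in the original source): mirroring the setUp in
# Example #1, a crawl job wraps a HeadlessAgent; constructing it creates the job
# folder and registers the job in the DB, yielding crawl_id.
ha = ag.HeadlessAgent()
crawl_job = ag.CrawlJob(ha)
crawl_job.urls = ['http://google.com', 'http://yahoo.com']
crawl_job.num_crawl_urls = len(crawl_job.urls)
print crawl_job.crawl_id, crawl_job.job_dir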
Example #7
0
 def __init__(self, agent):
     self.job_dir = create_job_folder()
     self.desc = ''
     self.urls = []
     self.url_tuples = []
     self.num_crawl_urls = 0
     self.max_parallel_procs = MAX_PARALLEL_PROCESSES
     self.crawl_agent = agent
     self.crawl_agent.crawl_job = self
     self.crawl_agent.job_dir = self.job_dir # for passing to multiprocessing worker - should find a better way
     self.index_html_log = os.path.join(self.crawl_agent.job_dir, 'index.html')
     self.db_conn = dbu.mysql_init_db('fp_detective')
     self.crawl_id = dbu.add_crawl_job_to_db(self, self.db_conn)
     self.crawl_agent.crawl_id = self.crawl_id
Example #8
0
def parse_mitm_dump(basename, worker, crawl_id):
    dumpfile = basename +'.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    responses = []
    if os.path.isfile(dumpfile):
        fr = flow.FlowReader(open(dumpfile))
        try: 
            for msg in fr.stream():
                requests.append(msg.request.get_url())
                # responses.append(msg.response.get_url())
                worker(msg, crawl_id) # this worker func should take care of db insertion, logging etc.
        except flow.FlowReadError as exc:
            # ignore read errors from truncated or corrupt dumps
            # wl_log.critical("Error reading mitm dump %s" % exc)
            pass
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)
    
    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1
    print os.path.basename(dumpfile[:-4]).split('-')[0]  # debug: rank prefix parsed from the dump filename
    doma_info.rank = int(os.path.basename(dumpfile).split('-')[0]) if '-' in dumpfile else 0
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)
    
    # parse 
    log_file = basename + '.txt'
    if not os.path.isfile(log_file):
        log_file = basename + '.' + MITM_LOG_EXTENSION
        
    insert_js_fun = functools.partial(lp.insert_js_info_to_db, site_info_id=site_info_id, db_conn=db_conn)
    lp.parse_crawl_log(log_file, insert_js_fun, crawl_id) # parse log, insert js info to db

    db_conn.commit()
    db_conn.close()
    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
Example #9
0
    def should_crawl_and_find_swfs(self,
                                   url,
                                   expected_strings=(None, None),
                                   unexpected_strings=(None, None),
                                   crawler_type='chrome_lazy'):
        expected_fields, expected_values = expected_strings
        unexpected_fields, unexpected_values = unexpected_strings

        crawl_id = ag.crawl_sites([
            (1, url),
        ], crawler_type, num_crawl_urls=1)

        db_conn = dbu.mysql_init_db()
        db_curs = db_conn.cursor(mdb.cursors.DictCursor)

        rows = swu.get_swf_obj_from_db('crawl_id', int(crawl_id), db_curs)

        if expected_fields:
            found_dict = {}
            for field in expected_fields:
                found_dict[field] = False

        for row in rows:
            if expected_fields:
                for expected_field, expected_value in zip(
                        expected_fields, expected_values):
                    if expected_value in row[expected_field]:
                        found_dict[expected_field] = True
                        print 'found in ', row[expected_field]

            if unexpected_values:
                for unexpected_field, unexpected_value in zip(
                        unexpected_fields, unexpected_values):
                    if unexpected_value in row[unexpected_field]:
                        self.fail(
                            'Unexpected field %s with unexpected value %s found'
                            % (unexpected_field, unexpected_value))
        if expected_fields:
            for field, found in found_dict.iteritems():
                if not found:
                    self.fail('Cannot find %s' % field)

        db_curs.close()
        db_conn.close()
Example #10
0
 def test_mysql_init_db(self):
     db_conn = dbu.mysql_init_db()
     self.assertTrue(db_conn, "Cannot initialize connection.")
Example #11
0
def store_swfs(msg, crawl_id, dir_path='/tmp', prefix='?'):

    referer = msg.request.headers['Referer'][0] if msg.request.headers[
        'Referer'] else ""

    if msg.response and msg.response.content:
        print msg.request.get_url()
        if msg.response.content[:3] in SWF_MAGIC_NUMBERS:  # too wide a check, but the decompiler will discard non-SWF content

            swf_hash = ut.hash_text(msg.response.content)
            swf_url = msg.request.get_url()

            db_conn = dbu.mysql_init_db()
            db_cursor = db_conn.cursor(dbu.mdb.cursors.DictCursor)
            rows = swu.get_swf_obj_from_db('hash', swf_hash, db_cursor)

            if not rows:
                swf_filename = os.path.join(
                    dir_path,
                    "%s-%s" % (prefix, msg.request.path.split('/')[-1]))
                swf_filename = swf_filename[:MAX_FILENAME_LEN]
                if not swf_filename.endswith('.swf'):
                    swf_filename += '.swf'

                wl_log.info("SWF saved %s referrer: %s" %
                            (os.path.basename(swf_filename), referer))

                fu.write_to_file(swf_filename, msg.response.content)
                vector = swu.get_occurence_vector_from_swf(
                    swf_filename, os.path.join(dir_path, prefix))
                duplicate_swf = 0
            else:
                wl_log.info("A swf with same hash exists in DB: %s %s" %
                            (swf_hash, swf_url))
                vector = swu.str_to_vector(rows[0]['occ_vector'])
                swf_filename = rows[0]['local_path']
                duplicate_swf = 1

            rank, domain = prefix.rsplit('/')[-1].split('-', 1)
            swf_info = swu.SwfInfo()

            swf_info.rank = rank  # rank parsed from the prefix; may not be the real site rank
            swf_info.domain = domain
            swf_info.local_path = swf_filename
            swf_info.occ_vector = vector
            swf_info.hash = swf_hash
            swf_info.url = swf_url
            swf_info.referer = referer
            swf_info.duplicate = duplicate_swf  # 1 if a SWF with the same hash was already in the DB
            swf_info.feat_vector = []
            swf_info.page_url = ''
            swf_info.occ_string = ' '.join(
                swu.human_readable_occ_vector(vector))
            swf_info.crawl_id = crawl_id

            swu.add_swf_to_db(swf_info, db_conn)
            db_conn.commit()
            db_cursor.close()
            db_conn.close()

        elif '.swf' in msg.request.path:
            wl_log.warning(".swf in path but content seems non-swf %s %s" %
                           (msg.request.path, msg.response.content[:100]))
        else:
            pass
Example #12
0
def insert_domain_info_to_db(domaInfo):
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(domaInfo, db_conn)
    dbu.add_js_info_to_db(domaInfo, db_conn, site_info_id)
    db_conn.commit()
    db_conn.close()
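
# Usage sketch (assumption, not in the original source): a DomainInfo populated as in
# the setUp of Example #1 can be persisted with this helper; the values below are illustrative.
di = lp.DomainInfo()
di.url = 'http://google.com'
di.rank = 1
di.crawl_id = 64654
di.requests = ['http://google.com']
di.responses = ['http://abc.com']
di.fc_dbg_font_loads = ['Arial', 'Tahoma']
di.fp_detected = lp.get_fp_from_reqs(di.requests)
di.log_complete = 1
insert_domain_info_to_db(di)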
Example #13
0
 def test_mysql_init_db(self):
     db_conn = dbu.mysql_init_db()
     self.assertTrue(db_conn, "Cannot initialize connection.")
Example #14
0
 def setUp(self):
     #test_db_file = ':memory:'
     #schema = '../swf_schema.sql'
     self.db_conn = dbu.mysql_init_db(
         'fp_detective_test')  # get a cursor for db
     self.db_cursor = self.db_conn.cursor(mdb.cursors.DictCursor)
Example #15
0
def insert_domain_info_to_db(domaInfo):
    db_conn = dbu.mysql_init_db()
    site_info_id = dbu.add_site_info_to_db(domaInfo, db_conn)
    dbu.add_js_info_to_db(domaInfo, db_conn, site_info_id)
    db_conn.commit()
    db_conn.close()