Exemplo n.º 1
0
    def test_get_fp_from_reqs(self):
        """Exercise lp.get_fp_from_reqs with the fp_urls entries and with other hosts."""
        # Each entry of fp_urls must be detected when it is the only request.
        for url in fp_urls:
            single_request = [url]
            detected = lp.get_fp_from_reqs(single_request)
            self.assertTrue(detected, 'Cannot find fp in %s' % url)

        # The complete fp_urls list must produce a truthy result as well.
        all_detected = lp.get_fp_from_reqs(fp_urls)
        self.assertTrue(all_detected)

        # Hosts that are not in fp_urls must produce a falsy result.
        other_requests = ['foo.org', 'foo.com', 'bar.org']
        none_detected = lp.get_fp_from_reqs(other_requests)
        self.assertFalse(none_detected)
Exemplo n.º 2
0
def parse_mitm_dump(basename, worker, crawl_id):
    """Parse a mitm dump plus its crawl log and store the results in the DB.

    Reads ``basename + '.dmp'`` flow by flow, passes every flow to ``worker``
    and collects the request URLs. A ``lp.DomainInfo`` record built from those
    requests is inserted with ``dbu.add_site_info_to_db``; afterwards the crawl
    log next to the dump is parsed with ``lp.parse_crawl_log``.

    Args:
        basename: Path of the dump file without the ``.dmp`` extension.
        worker: Callable invoked as ``worker(msg, crawl_id)`` for each flow;
            it should take care of db insertion, logging etc.
        crawl_id: Crawl identifier stored on the site info record and passed
            on to ``worker`` and ``lp.parse_crawl_log``.
    """
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    # Response URLs are currently not collected, so this list stays empty.
    responses = []
    if os.path.isfile(dumpfile):
        # Binary mode because the dump is not text; the context manager makes
        # sure the handle is released before the file may be removed below.
        with open(dumpfile, 'rb') as dump_fo:
            fr = flow.FlowReader(dump_fo)
            try:
                for msg in fr.stream():
                    requests.append(msg.request.get_url())
                    # this worker func should take care of db insertion,
                    # logging etc.
                    worker(msg, crawl_id)
            except flow.FlowReadError as exc:
                # Best effort: keep the flows read so far, but leave a trace
                # instead of swallowing the error silently.
                wl_log.info("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1

    # The rank is the numeric prefix of the file name ("<rank>-<rest>.dmp").
    # Only the file name is inspected: a '-' in a directory name must not
    # trigger the integer conversion.
    dump_name = os.path.basename(basename)
    rank_prefix = dump_name.split('-')[0]
    print(rank_prefix)  # parenthesised form prints the same on Python 2 and 3
    try:
        doma_info.rank = int(rank_prefix) if '-' in dump_name else 0
    except ValueError:
        # Non-numeric prefix: use the same default as for names without '-'.
        doma_info.rank = 0

    db_conn = dbu.mysql_init_db()
    try:
        site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

        # Prefer the '.txt' crawl log; otherwise use the mitm log extension.
        log_file = basename + '.txt'
        if not os.path.isfile(log_file):
            log_file = basename + '.' + MITM_LOG_EXTENSION

        insert_js_fun = functools.partial(lp.insert_js_info_to_db,
                                          site_info_id=site_info_id,
                                          db_conn=db_conn)
        # parse log, insert js info to db
        lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)

        db_conn.commit()
    finally:
        # Close the connection even when inserting or parsing fails.
        db_conn.close()

    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
Exemplo n.º 3
0
def parse_mitm_dump(basename, worker, crawl_id):
    """Process a mitm dump and the related crawl log, writing results to the DB.

    The dump ``basename + '.dmp'`` is read one flow at a time: each flow is
    handed to ``worker`` and its request URL is recorded. The recorded
    requests populate a ``lp.DomainInfo`` object that is stored through
    ``dbu.add_site_info_to_db``. Finally the crawl log belonging to the dump
    is parsed by ``lp.parse_crawl_log``.

    Args:
        basename: Dump file path without the ``.dmp`` suffix.
        worker: Function called as ``worker(msg, crawl_id)`` per flow; it
            should take care of db insertion, logging etc.
        crawl_id: Id of the crawl; stored on the site info record and
            forwarded to ``worker`` and ``lp.parse_crawl_log``.
    """
    dumpfile = basename + '.dmp'
    wl_log.info("Will parse mitm dump %s for crawl: %s" % (dumpfile, crawl_id))
    requests = []
    # Nothing appends to this list at the moment; it is stored empty.
    responses = []
    if os.path.isfile(dumpfile):
        # Open in binary mode (the dump is not text) and close the handle
        # deterministically so the later os.remove never hits an open file.
        with open(dumpfile, 'rb') as dump_fo:
            fr = flow.FlowReader(dump_fo)
            try:
                for msg in fr.stream():
                    requests.append(msg.request.get_url())
                    # this worker func should take care of db insertion,
                    # logging etc.
                    worker(msg, crawl_id)
            except flow.FlowReadError as exc:
                # Continue with whatever was read, but report the problem
                # rather than hiding it.
                wl_log.info("Error reading mitm dump %s" % exc)
    else:
        wl_log.critical("Cannot find mitm dump %s" % dumpfile)

    doma_info = lp.DomainInfo()
    doma_info.requests = requests
    doma_info.responses = responses
    doma_info.crawl_id = crawl_id
    doma_info.url = ""
    doma_info.fc_dbg_font_loads = []
    doma_info.fp_detected = lp.get_fp_from_reqs(requests)
    doma_info.log_complete = 1

    # Rank = integer prefix of the file name before the first '-'. The check
    # uses the file name only, so dashes in parent directories are ignored.
    dump_name = os.path.basename(basename)
    rank_prefix = dump_name.split('-')[0]
    print(rank_prefix)  # same output under Python 2 and Python 3
    try:
        doma_info.rank = int(rank_prefix) if '-' in dump_name else 0
    except ValueError:
        # Prefix is not a number: fall back to the default rank.
        doma_info.rank = 0

    db_conn = dbu.mysql_init_db()
    try:
        site_info_id = dbu.add_site_info_to_db(doma_info, db_conn)

        # Use the '.txt' log when present, else the mitm log extension.
        log_file = basename + '.txt'
        if not os.path.isfile(log_file):
            log_file = basename + '.' + MITM_LOG_EXTENSION

        insert_js_fun = functools.partial(lp.insert_js_info_to_db, site_info_id=site_info_id, db_conn=db_conn)
        lp.parse_crawl_log(log_file, insert_js_fun, crawl_id)  # parse log, insert js info to db

        db_conn.commit()
    finally:
        # Always release the connection, including on failure.
        db_conn.close()

    wl_log.info("Parsed %s OK" % (dumpfile))
    if REMOVE_DMP_FILES:
        os.remove(dumpfile)
Exemplo n.º 4
0
 def test_get_fp_from_reqs(self):
     """Verify lp.get_fp_from_reqs for single fp_urls entries, the full list and other hosts."""
     # A request list holding one fp_urls entry must give a truthy result.
     for candidate in fp_urls:
         result = lp.get_fp_from_reqs([candidate])
         self.assertTrue(result, 'Cannot find fp in %s' % candidate)

     # should return something different if it finds fp urls
     result_for_all = lp.get_fp_from_reqs(fp_urls)
     self.assertTrue(result_for_all)

     # Requests outside fp_urls must give a falsy result.
     unrelated = ['foo.org', 'foo.com', 'bar.org']
     result_for_unrelated = lp.get_fp_from_reqs(unrelated)
     self.assertFalse(result_for_unrelated)