예제 #1
0
    def test_check_canvas_rw_access(self):
        """Two canvas events from one script make that script's URL the result.

        An EVENT_TODATAURL event and an EVENT_FILLTEXT event are both
        attributed to the same ``js_file``; ``ex.check_canvas_rw_access`` is
        expected to return a list holding only that URL.
        """
        script_url = "http://example.com/fp.js"
        events = []
        for ev_type in (cm.EVENT_TODATAURL, cm.EVENT_FILLTEXT):
            event = cm.BrowserEvent()
            event.event_type = ev_type
            event.js_file = script_url
            events.append(event)

        flagged = ex.check_canvas_rw_access(events)
        self.assertEqual([script_url], flagged)
예제 #2
0
    def test_check_canvas_rw_access_for_diff_js(self):
        """Canvas events attributed to different scripts yield an empty result.

        The EVENT_TODATAURL event and the EVENT_FILLTEXT event carry
        different ``js_file`` values, so ``ex.check_canvas_rw_access`` is
        expected to return an empty list.
        """
        fixtures = (
            (cm.EVENT_TODATAURL, "http://example.com/fp.js"),
            (cm.EVENT_FILLTEXT, "http://example.com/ga.js"),
        )
        events = []
        for ev_type, script_url in fixtures:
            event = cm.BrowserEvent()
            event.event_type = ev_type
            event.js_file = script_url
            events.append(event)

        flagged = ex.check_canvas_rw_access(events)
        self.assertEqual([], flagged)
예제 #3
0
def ff_log_parser(log_file, visit_info):
    events = []
    fp_events = []
    for line in gen_cat_file(log_file):
        # print line
        if line.startswith("FPD"):
            line = line.replace("FPD\t", "").rstrip()
            be = cm.BrowserEvent()
            log_items = line.split("\t")
            if len(log_items) == 5:
                be.initiator, be.event_type, be.js_file,\
                    be.js_line, be.log_text = log_items
                be.url = visit_info.url
                be.rank = visit_info.rank
                events.append(be)
            elif len(log_items) == 4 and log_items[1] in cm.EVENTS_WITH_4_ARGS:
                be.initiator, cm.event_type, be.js_file,\
                    be.js_line = log_items
                # filter out screenshots taken by browser for page thumbnails
                if not be.js_line.startswith('resource'):
                    events.append(be)
            else:
                print "Hmm WAT!:", log_items

    canvas_fp_urls = check_canvas_rw_access(events)
    for canvas_fp_url in canvas_fp_urls:
        for event in events:
            # we only add events from this URL
            if event.js_file == canvas_fp_url:
                fp_events.append(event)
    return fp_events, events
예제 #4
0
def get_canvas_event(cur, canvas_ev_id):
    """Fetch one row of the ``canvas`` table and rebuild its BrowserEvent.

    Args:
        cur: DB cursor; it is used through ``execute`` (with a named
            ``:Id`` parameter) and ``fetchone``.
        canvas_ev_id: value matched against the ``id`` column.

    Returns:
        ``(visit_id, data_url_id, event_time, be)`` where ``be`` is a
        ``cm.BrowserEvent`` carrying the row's url, js_file, js_line and
        event_type.
    """
    query = "SELECT * FROM canvas WHERE id=:Id"
    cur.execute(query, {"Id": canvas_ev_id})
    row = cur.fetchone()
    event = cm.BrowserEvent()
    # The row is expected to have exactly eight columns; the first one is
    # not used.
    (_unused, visit_id, url, js_file, js_line, event_type,
     event_time, data_url_id) = row
    event.url = url
    event.js_file = js_file
    event.js_line = js_line
    event.event_type = event_type
    return visit_id, data_url_id, event_time, event
예제 #5
0
    def test_update_visit(self):
        """Insert a visit, update it, and verify the stored values each time.

        After both the ADD_VISIT and the UPDATE_VISIT command, the visit is
        read back by id and its ``duration`` and ``incomplete`` fields are
        compared with the in-memory VisitInfo.
        """
        url = "http://example.com"
        stamp = strftime("%Y%m%d-%H%M%S")
        new_visit_ev = cm.BrowserEvent()
        new_visit_ev.event_type = cm.EVENT_NEW_VISIT

        visit = cm.VisitInfo()
        visit.url = url
        visit.start_time = stamp
        visit.out_db = self.test_db
        visit.duration = 0
        visit.incomplete = 1

        def check_stored_visit():
            # Read the visit back and compare the two fields under test.
            stored = dbu.get_db_entry(self.test_db, dbu.DBCmd.VISIT_BY_ID,
                                      visit.visit_id)
            self.assertEqual(visit.duration, stored.duration)
            self.assertEqual(visit.incomplete, stored.incomplete)

        visit.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT,
                                          new_visit_ev, visit)
        check_stored_visit()

        visit.duration = 33
        visit.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, new_visit_ev, visit)
        check_stored_visit()
예제 #6
0
def parse_strace_logs(visit_info, test_lso=None):
    events = []
    lines_seen = Set()
    # cm.print_log(visit_info, visit_info.sys_log, cm.LOG_DEBUG)
    for line in fu.gen_cat_file(visit_info.sys_log):
        if (".macromedia/Flash_Player/#SharedObjects" not in line
                or (line in lines_seen)):
            continue

        lines_seen.add(line)
        try:
            pieces = line.split("#SharedObjects/")[1].split("/")
            lso_file, mode_str = pieces[-1].split("\"")
            if "O_RDWR" in mode_str:
                mode = cm.ACCESS_MODE_READ_WRITE
            else:
                mode = cm.ACCESS_MODE_READ_ONLY
            domain = pieces[1]
            filename = line.split("\"")[1]
            file_ext = filename.rsplit(".", 1)[-1]
            # We observe .sxx extension for LSOs instead of .sol in strace logs
            # We recover the real filename by replacing
            # [pid 26407] open("/home/xyz/.macromedia/Flash_Player/#SharedObjects/GWZSHMBL/securehomes.esat.kuleuven.be/FlashCookie.sxx", O_RDWR|O_CREAT|O_APPEND, 0666) = 18  # noqa
            # [pid 26407] write(18, "\0\277\0\0\0-TCSO\0\4\0\0\0\0\0\vFlashCookie\0\0\0\3\21test_key\6\rjb0uf9\0", 51) = 51  # noqa
            # the only file in SharedObjects/GWZSHMBL/securehomes.esat.kuleuven.be would be a .sol file, which is not in the strace logs if it's created on this visit.  # noqa
            if file_ext == "sxx":
                filename = filename.replace(".sxx", ".sol")
            if test_lso is not None:  # just to simplify tests
                filename = test_lso  # override the lso filename
            if not filename.endswith(".sol"):
                print "Unexpected LSO file extension", filename, file_ext, line
                continue
            if not os.path.isfile(filename):
                # Happens when the (temp) LSO is removed before the visit ends.
                print "Cannot find LSO file", filename, file_ext, line
                continue
        except ValueError:
            continue
        # TODO: move below code to a separate function
        try:
            lso_dict = sol.load(filename)
            for k, v in lso_dict.iteritems():
                be = cm.BrowserEvent()
                be.event_type = cm.EVENT_FLASH_LSO
                be.js_file = lso_file
                be.initiator = domain
                be.mode = mode  # this doesn't seem to work
                be.cookie_path = filename.split("#SharedObjects/")[1]
                be.key = unicode(k)
                try:
                    be.log_text = unicode(v)
                except UnicodeDecodeError:  # obj is byte string
                    ascii_text = str(v).encode('string_escape')
                    be.log_text = unicode(ascii_text)
                events.append(be)
        except Exception as exc:
            cm.print_log(visit_info,
                         "Error parsing LSO %s %s" % (filename, exc))
    return events
예제 #7
0
 def test_r_w_visit_to_db(self):
     """Write a visit to the DB and read it back by id.

     The ``url`` and ``start_time`` of the stored visit must equal the
     values of the VisitInfo that was inserted.
     """
     url = "http://example.com"
     stamp = strftime("%Y%m%d-%H%M%S")
     new_visit_ev = cm.BrowserEvent()
     new_visit_ev.event_type = cm.EVENT_NEW_VISIT

     visit = cm.VisitInfo()
     visit.url = url
     visit.start_time = stamp
     visit.out_db = self.test_db

     row_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, new_visit_ev, visit)
     stored = dbu.get_db_entry(self.test_db, dbu.DBCmd.VISIT_BY_ID, row_id)
     for field in ("url", "start_time"):
         self.assertEqual(getattr(visit, field), getattr(stored, field))
예제 #8
0
 def test_r_w_canvas_to_db(self):
     """Write a canvas event to the DB and read it back by id.

     The insert must return an id greater than zero.  The entry read back
     must carry the visit id of the VisitInfo used for the insert, a data
     URL id of 1, an event time of 0, and the same event_type, url,
     js_file and js_line as the inserted event.
     """
     event = cm.BrowserEvent()
     event_fields = (
         ("event_type", cm.EVENT_TODATAURL),
         ("url", "http://example.com"),
         ("js_file", "http://example.com/fp.js"),
         ("js_line", 5),
         ("txt", "data:asdsads"),
     )
     for field, value in event_fields:
         setattr(event, field, value)

     visit = cm.VisitInfo()
     visit.visit_id = 1
     visit.out_db = self.test_db

     canvas_ev_id = dbu.insert_to_db(dbu.DBCmd.ADD_CANVAS, event, visit)
     self.assertGreater(canvas_ev_id, 0)

     stored = dbu.get_db_entry(self.test_db, dbu.DBCmd.CANVAS_BY_ID,
                               canvas_ev_id)
     visit_id, data_url_id, event_time, stored_event = stored
     self.assertEqual(visit.visit_id, visit_id)
     self.assertEqual(data_url_id, 1)
     for field in ("event_type", "url", "js_file", "js_line"):
         self.assertEqual(getattr(stored_event, field),
                          getattr(event, field))
     self.assertEqual(event_time, 0)
예제 #9
0
def visit_page(url_tuple,
               timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE,
               pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR,
               flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one page with an instrumented browser and process its logs.

    The visit is registered in the DB, logging/capture helpers and the
    browser are started, the page is loaded, and after ``wait_on_site``
    seconds the browser is closed and the collected output is handed to
    ``process_crawler_output``.

    Args:
        url_tuple: either a ``(rank, url)`` pair or a bare URL; a bare URL
            is given rank 0.  A URL whose first five characters are not
            one of 'data:', 'http:', 'https', 'file:' gets 'http://'
            prepended.
        timeout: accepted for interface compatibility; not read in this
            function body.
        wait_on_site: seconds to sleep after the page load returns.
        pre_crawl_sleep: accepted for interface compatibility; not read in
            this function body.
        out_dir: directory for the per-visit log, dump and DB files.
        flash_support: passed to ``get_browser`` and
            ``process_crawler_output``; when truthy, syscalls of the
            browser process are logged as well.
        cookie_support: passed to ``get_browser``.

    Returns:
        The value returned by ``process_crawler_output`` on success, or
        ``None`` when the visit timed out or raised any other exception
        (the error is logged and ``clean_up`` is called).
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Unpacking a non-iterable raises TypeError and a sequence of the
        # wrong length (e.g. a URL string) raises ValueError; catching
        # only those keeps KeyboardInterrupt/SystemExit from being
        # swallowed.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(
        out_dir, "syscall-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(
        out_dir, "http-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(
        out_dir, "mitm-%s-%s.dmp" % (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log file is opened with the URL as given, before any scheme is
    # prepended below.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)

    if visit_info.url[:5] not in ('data:', 'http:', 'https', 'file:'):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(
            visit_info, "Visiting: %s %s (%s)" %
            (visit_info.visit_id, visit_info.url, visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc =\
            get_browser(visit_info.ff_log, port, flash_support, cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)

        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(
            visit_info, "Visit OK: %s %s (%s)" %
            (visit_info.visit_id, visit_info.url, visit_info.rank))
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Top-level boundary of a visit: log with traceback, clean up and
        # report failure to the caller as None.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict