def insert_all_visit_data(cursor, visit_info, db_jobs):
    """Run every pending DB job collected for a visit.

    Args:
        cursor: DB cursor handed through to the insert/update helpers.
        visit_info: visit record; its ``url`` is included in error messages.
        db_jobs: mapping of ``DBCmd`` command -> data for that command.

    Jobs whose data is empty/falsy are skipped, and commands that match
    none of the ``DBCmd`` values below are ignored. An ``InterfaceError``
    or ``ProgrammingError`` raised by one job is reported through
    ``cm.print_error`` and the remaining jobs still run; any other
    exception propagates to the caller.
    """
    # .items() instead of .iteritems(): the latter only exists on
    # Python 2 dicts, while .items() works on both Python 2 and 3.
    for db_cmd, data in db_jobs.items():
        if not data:
            continue
        try:
            if db_cmd == DBCmd.ADD_CANVAS:
                insert_canvas_events(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_VISIT:
                insert_visit_event(cursor, visit_info, data)
            elif db_cmd == DBCmd.UPDATE_VISIT:
                update_visit(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_COOKIES:
                insert_ff_cookies(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_LOCALSTORAGE_ITEMS:
                insert_localstorage_items(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_INDEXEDDB_ITEMS:
                insert_indexed_db_items(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_CACHE_ITEMS:
                insert_cache_items(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_LSO_ITEMS:
                insert_flash_cookies(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_HTTP_HEADERS:
                insert_http_headers(cursor, visit_info, data)
        except (sq.InterfaceError, sq.ProgrammingError) as ie:
            # Report and move on so one bad job does not block the others.
            cm.print_error(
                visit_info,
                "Error inserting to DB %s %s %s" % (visit_info.url,
                                                    db_cmd, ie))
def insert_all_visit_data(cursor, visit_info, db_jobs):
    """Run every pending DB job collected for a visit.

    Args:
        cursor: DB cursor handed through to the insert/update helpers.
        visit_info: visit record; its ``url`` is included in error messages.
        db_jobs: mapping of ``DBCmd`` command -> data for that command.

    Jobs whose data is empty/falsy are skipped, and commands that match
    none of the ``DBCmd`` values below are ignored. An ``InterfaceError``
    or ``ProgrammingError`` raised by one job is reported through
    ``cm.print_error`` and the remaining jobs still run; any other
    exception propagates to the caller.
    """
    # .items() instead of .iteritems(): the latter only exists on
    # Python 2 dicts, while .items() works on both Python 2 and 3.
    for db_cmd, data in db_jobs.items():
        if not data:
            continue
        try:
            if db_cmd == DBCmd.ADD_CANVAS:
                insert_canvas_events(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_VISIT:
                insert_visit_event(cursor, visit_info, data)
            elif db_cmd == DBCmd.UPDATE_VISIT:
                update_visit(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_COOKIES:
                insert_ff_cookies(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_LOCALSTORAGE_ITEMS:
                insert_localstorage_items(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_INDEXEDDB_ITEMS:
                insert_indexed_db_items(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_CACHE_ITEMS:
                insert_cache_items(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_LSO_ITEMS:
                insert_flash_cookies(cursor, visit_info, data)
            elif db_cmd == DBCmd.ADD_HTTP_HEADERS:
                insert_http_headers(cursor, visit_info, data)
        except (sq.InterfaceError, sq.ProgrammingError) as ie:
            # Report and move on so one bad job does not block the others.
            cm.print_error(
                visit_info,
                "Error inserting to DB %s %s %s" % (visit_info.url,
                                                    db_cmd, ie))
def visit_page(url_tuple, timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE, pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR, flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one URL in an instrumented browser and return the crawl output.

    Args:
        url_tuple: a ``(rank, url)`` pair. A value that does not unpack
            into exactly two items is treated as the URL itself, with
            rank 0.
        timeout: accepted for interface compatibility; not used in this
            function body (the page load uses ``cm.SOFT_TIMEOUT``).
        wait_on_site: seconds to sleep after the page load call returns.
        pre_crawl_sleep: accepted for interface compatibility; not used
            in this function body.
        out_dir: directory that receives the per-visit logs, the HTTP
            dump, ``error.log``, ``debug.log`` and the DB file.
        flash_support: passed to ``get_browser`` and
            ``process_crawler_output``; when truthy, ``log_syscalls`` is
            also started for the browser process.
        cookie_support: passed to ``get_browser``.

    Returns:
        Whatever ``process_crawler_output`` returns when the visit
        completes, or ``None`` when the visit times out or raises any
        other exception (the error is logged and ``clean_up`` is called).
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are caught so that KeyboardInterrupt
        # and SystemExit are not swallowed here.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(
        out_dir, "syscall-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(
        out_dir, "http-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(
        out_dir, "mitm-%s-%s.dmp" % (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log file is opened with the URL as given, before a scheme is
    # prepended below.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)

    # Compare against full "scheme:" prefixes. A fixed five-character
    # slice cannot tell "https:" apart from a host name that merely
    # starts with "https".
    if not visit_info.url.startswith(('data:', 'http:', 'https:', 'file:')):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(
            visit_info,
            "Visiting: %s %s (%s)" % (visit_info.visit_id, visit_info.url,
                                      visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc = \
            get_browser(visit_info.ff_log, port, flash_support,
                        cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)
        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(
            visit_info,
            "Visit OK: %s %s (%s)" % (visit_info.visit_id, visit_info.url,
                                      visit_info.rank))
        # Mark the visit as complete in the DB before tearing down.
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Boundary handler: log with traceback and clean up so one failed
        # visit returns None instead of raising to the caller.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict
def visit_page(url_tuple, timeout=cm.HARD_TIME_OUT,
               wait_on_site=cm.WAIT_ON_SITE, pre_crawl_sleep=False,
               out_dir=cm.BASE_TMP_DIR, flash_support=cm.FLASH_ENABLE,
               cookie_support=cm.COOKIE_ALLOW_ALL):
    """Visit one URL in an instrumented browser and return the crawl output.

    Args:
        url_tuple: a ``(rank, url)`` pair. A value that does not unpack
            into exactly two items is treated as the URL itself, with
            rank 0.
        timeout: accepted for interface compatibility; not used in this
            function body (the page load uses ``cm.SOFT_TIMEOUT``).
        wait_on_site: seconds to sleep after the page load call returns.
        pre_crawl_sleep: accepted for interface compatibility; not used
            in this function body.
        out_dir: directory that receives the per-visit logs, the HTTP
            dump, ``error.log``, ``debug.log`` and the DB file.
        flash_support: passed to ``get_browser`` and
            ``process_crawler_output``; when truthy, ``log_syscalls`` is
            also started for the browser process.
        cookie_support: passed to ``get_browser``.

    Returns:
        Whatever ``process_crawler_output`` returns when the visit
        completes, or ``None`` when the visit times out or raises any
        other exception (the error is logged and ``clean_up`` is called).
    """
    driver = None
    visit_info = cm.VisitInfo()
    try:
        visit_info.rank, visit_info.url = url_tuple
    except (TypeError, ValueError):
        # When rank of the page is not provided, we'll use rank=0.
        # Only unpacking failures are caught so that KeyboardInterrupt
        # and SystemExit are not swallowed here.
        visit_info.rank, visit_info.url = 0, url_tuple

    visit_info.sys_log = join(
        out_dir, "syscall-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_log = join(
        out_dir, "http-%s-%s.log" % (visit_info.rank, ut.rand_str()))
    visit_info.http_dump = join(
        out_dir, "mitm-%s-%s.dmp" % (visit_info.rank, ut.rand_str()))
    visit_info.start_time = strftime("%Y%m%d-%H%M%S")
    visit_info.out_dir = out_dir
    visit_info.out_db = join(visit_info.out_dir, cm.DB_FILENAME)
    visit_info.err_log = join(out_dir, "error.log")
    visit_info.debug_log = join(out_dir, "debug.log")

    be = cm.BrowserEvent()
    be.event_type = cm.EVENT_NEW_VISIT

    # The log file is opened with the URL as given, before a scheme is
    # prepended below.
    visit_info.ff_log = open_log_file(out_dir, visit_info.url)

    # Compare against full "scheme:" prefixes. A fixed five-character
    # slice cannot tell "https:" apart from a host name that merely
    # starts with "https".
    if not visit_info.url.startswith(('data:', 'http:', 'https:', 'file:')):
        visit_info.url = 'http://' + visit_info.url

    try:
        visit_info.visit_id = dbu.insert_to_db(dbu.DBCmd.ADD_VISIT, be,
                                               visit_info)
        cm.print_debug(
            visit_info,
            "Visiting: %s %s (%s)" % (visit_info.visit_id, visit_info.url,
                                      visit_info.rank))
        setup_nspr_logging(visit_info.http_log)
        visit_info.vdisplay = start_xvfb()
        port, visit_info.mitm_proc = start_mitm_capture(visit_info.http_dump)
        driver, visit_info.profile_dir, visit_info.sel_proc = \
            get_browser(visit_info.ff_log, port, flash_support,
                        cookie_support)
        if flash_support:
            visit_info.strace_proc = log_syscalls(visit_info.sel_proc,
                                                  visit_info.sys_log)
        #############################################################
        driver_get(driver, visit_info, cm.SOFT_TIMEOUT)  # real visit
        #############################################################
        time.sleep(wait_on_site)
        close_driver(driver, timeout=10)
        stop_strace(visit_info.strace_proc)
        result_dict = process_crawler_output(visit_info.ff_log, visit_info,
                                             flash_support)
        cm.print_debug(
            visit_info,
            "Visit OK: %s %s (%s)" % (visit_info.visit_id, visit_info.url,
                                      visit_info.rank))
        # Mark the visit as complete in the DB before tearing down.
        visit_info.incomplete = 0
        dbu.insert_to_db(dbu.DBCmd.UPDATE_VISIT, be, visit_info)
        quit_driver(driver)
        stop_xvfb(visit_info.vdisplay)
        remove_visit_files(visit_info)
    except (cm.TimeExceededError, sel_exceptions.TimeoutException) as texc:
        err_str = "Visit to %s(%s) timed out %s" % \
            (visit_info.url, visit_info.rank, texc)
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    except Exception as exc:
        # Boundary handler: log with traceback and clean up so one failed
        # visit returns None instead of raising to the caller.
        err_str = "Exception visiting %s(%s) %s %s" % \
            (visit_info.url, visit_info.rank, exc, traceback.format_exc())
        cm.print_error(visit_info, err_str)
        clean_up(visit_info, driver)
        return None
    else:
        return result_dict