Example #1
def crawl(crawl_info):
    # modified get function with specific browser
    if isfile(crawl_info.urls):
        url_tuples = wu.gen_url_list(crawl_info.max_rank,
                                     crawl_info.min_rank, True,
                                     crawl_info.urls)
    else:
        url_tuples = [(0, crawl_info.urls), ]  # a single url has been passed

    machine_id = read_machine_id()
    suffix = "_%s_FL%s_CO%s_%s_%s" % (machine_id, crawl_info.flash_support,
                                      crawl_info.cookie_support,
                                      crawl_info.min_rank,
                                      crawl_info.max_rank)
    out_dir, crawl_name = create_job_folder(suffix)
    # copy_mitm_certs()
    db_file = join(out_dir, cm.DB_FILENAME)

    report_file = join(out_dir, "%s.html" % crawl_name)
    print "Crawl name:", crawl_name
    dbu.create_db_from_schema(db_file)
    custom_get = partial(run_cmd, out_dir=out_dir,
                         flash_support=crawl_info.flash_support,
                         cookie_support=crawl_info.cookie_support)
    parallelize.run_in_parallel(url_tuples, custom_get,
                                crawl_info.max_parallel_procs)
    gr.gen_crawl_report(db_file, report_file)
    # clean_tmp_files(out_dir)
    zipped = pack_data(out_dir)
    if crawl_info.upload_data:
        ssh.scp_put_to_server(zipped)
        ssh.scp_put_to_server(report_file)
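
The example above drives the whole crawl from a single crawl_info object. Below is a minimal driver sketch, assuming crawl_info only needs the attributes that crawl() actually reads (urls, min_rank, max_rank, flash_support, cookie_support, max_parallel_procs, upload_data); the concrete values and the use of argparse.Namespace as a stand-in container are illustrative, not taken from the original module.

from argparse import Namespace

# Hypothetical invocation: a plain attribute container stands in for the
# parsed command-line options the original module presumably builds.
crawl_info = Namespace(
    urls="top-sites.csv",    # path to a URL list file, or a single URL string
    min_rank=1,              # rank range forwarded to wu.gen_url_list
    max_rank=100,
    flash_support=1,         # forwarded to run_cmd via functools.partial
    cookie_support=1,
    max_parallel_procs=8,    # degree of parallelism for run_in_parallel
    upload_data=False,       # skip the scp upload of the packed results
)
crawl(crawl_info)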
Example #2
def setUp(self):
    self.vi = cm.VisitInfo()
    self.vi.visit_id = 1
    self.test_db = os.path.join(cm.BASE_TMP_DIR, cm.DB_FILENAME)
    self.vi.out_db = self.test_db
    self.vi.out_dir = cm.BASE_TMP_DIR
    self.vi.err_log = os.path.join(cm.BASE_TMP_DIR, "err.log")
    self.vi.sys_log = os.path.join(cm.BASE_TMP_DIR, "sys.log")
    self.vi.ff_log = os.path.join(cm.BASE_TMP_DIR, "ff_test.log")
    self.vi.log_options = [cm.LOG_TO_FILE, cm.LOG_TO_CONSOLE]
    self.vi.url = "http://xyz.org"
    dbu.create_db_from_schema(self.test_db)
    self.files_to_remove.extend([self.test_db, self.vi.sys_log,
                                 self.vi.ff_log])
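
Below is a sketch of a test method that could sit alongside this fixture, assuming the enclosing class is a standard unittest.TestCase (so assertTrue is available), that os is imported at module level as in setUp, and that dbu.create_db_from_schema writes a SQLite file; the method name and checks are illustrative only.

def test_db_created_from_schema(self):
    import sqlite3

    # setUp should have materialised the schema as a database file on disk.
    self.assertTrue(os.path.isfile(self.test_db))
    # Assumption: the file is SQLite; just verify it opens cleanly.
    conn = sqlite3.connect(self.test_db)
    conn.close()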