def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--url-file', required=True,
                        help='Path to the file that contains the list of URLs to crawl.',
                        default=cm.LOCALIZED_DATASET)
    parser.add_argument('-t', '--type', choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.CRAWL_DIR)
    parser.add_argument('-c', '--config',
                        help="Crawler tor driver and controller configurations.",
                        choices=config.sections(),
                        default="default")
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity',
                        default=False)

    # Crawler features
    parser.add_argument('-x', '--virtual-display',
                        help='Dimensions of the virtual display, eg 1200x800',
                        default='')
    parser.add_argument('-s', '--screenshots', action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Select URLs from this line number: (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Select URLs after this line number: (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Change results dir if output
    cm.CRAWL_DIR = args.output
    del args.output

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
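
# The '-c/--config' choices above are populated at import time from the
# sections of cm.CONFIG_FILE. Below is a minimal, self-contained sketch of
# that pattern; the section names and the 'torrc_extra' key are hypothetical,
# and Python 3's configparser stands in for the Python 2 ConfigParser module
# that the code above targets.
import argparse
import configparser

_config = configparser.RawConfigParser()
_config.read_string("""
[default]
torrc_extra =
[bridge]
torrc_extra = UseBridges 1
""")

_parser = argparse.ArgumentParser()
_parser.add_argument('-c', '--config', choices=_config.sections(),
                     default='default',
                     help='Crawler tor driver and controller configurations.')

print(_parser.parse_args(['--config', 'bridge']).config)  # -> bridge
# Unknown section names are rejected by argparse with a usage error:
# _parser.parse_args(['--config', 'nope'])  # SystemExit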
def parse_arguments():
    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--url-list', required=True,
                        help='Path to the file that contains the list of URLs to crawl.')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.RESULTS_DIR)
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_PATH)
    parser.add_argument('-e', '--experiment', choices=cm.EXP_TYPES,
                        help="Specifies the crawling methodology.",
                        default=cm.EXP_TYPE_WANG_AND_GOLDBERG)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity',
                        default=False)

    # For the meaning of the batch and instance parameters, refer to
    # Wang and Goldberg's WPES'13 paper, Section 4.1.4.
    parser.add_argument('--batches', type=int,
                        help='Number of batches in the crawl (default: %s)' % cm.NUM_BATCHES,
                        default=cm.NUM_BATCHES)
    parser.add_argument('--instances', type=int,
                        help='Number of instances to crawl for each web page (default: %s)' % cm.NUM_INSTANCES,
                        default=cm.NUM_INSTANCES)

    # Crawler features
    parser.add_argument('-x', '--xvfb', action='store_true',
                        help='Use XVFB (for headless testing)',
                        default=False)
    parser.add_argument('-c', '--capture-screen', action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Start crawling URLs from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Stop crawling URLs after this line number (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    wl_log.debug("Command line parameters: %s" % argv)
    return args
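
# Sketch of a batched visit schedule in the spirit of Wang and Goldberg's
# methodology (WPES'13, Section 4.1.4), which the --batches/--instances
# options above parameterize: each page is visited num_instances times in
# total, spread evenly over num_batches passes so that repeated visits to
# the same page are separated in time. Illustrative only; the crawler's
# actual ordering is not shown here, and the sketch assumes num_instances
# is divisible by num_batches.
def batched_schedule(urls, num_batches, num_instances):
    per_batch = num_instances // num_batches  # visits per page per batch
    visits = []
    for batch in range(num_batches):
        for url in urls:
            for i in range(per_batch):
                visits.append((batch, url, batch * per_batch + i))
    return visits

for visit in batched_schedule(['page-a', 'page-b'], num_batches=2, num_instances=4):
    print(visit)  # (batch, url, instance index)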
    # (fragment: the leading `default=False)` closes a parser.add_argument
    #  call whose beginning is not included in this excerpt)
                        default=False)

    args = parser.parse_args()
    url_list_path = args.url_list
    verbose = args.verbose
    tbb_version = args.browser_version
    no_of_batches = int(args.batch)
    no_of_instances = int(args.instance)
    start_line = int(args.start) if args.start else 1
    stop_line = int(args.stop) if args.stop else 999999999999
    xvfb = args.xvfb
    capture_screen = args.capture_screen

    # Set verbose level
    if verbose:
        wl_log.setLevel(logging.DEBUG)
    else:
        wl_log.setLevel(logging.INFO)

    # Validate the given arguments
    # Read urls and keep only the requested line range
    url_list = np.loadtxt(url_list_path, delimiter='\n', dtype=str)
    url_list = url_list.tolist()
    url_list = url_list[start_line - 1:stop_line]

    torrc_dict = cm.TORRC_DEFAULT

    # Fall back to the default TBB version; reject unrecognized ones
    if not tbb_version:
        tbb_version = cm.TBB_DEFAULT_VERSION
    elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
        ut.die('Version of Tor browser is not recognized.')
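
# The --start/--stop limits are 1-based, inclusive line numbers into the
# URL file, which is why the fragment slices with [start_line - 1:stop_line].
# A dependency-free sketch of that indexing (np.loadtxt above likewise
# yields one string per line):
lines = ['url-1', 'url-2', 'url-3', 'url-4', 'url-5']
start_line, stop_line = 2, 4
print(lines[start_line - 1:stop_line])  # -> ['url-2', 'url-3', 'url-4']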
def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(
        description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--urls', required=True,
                        help='Path to the file that contains the list of URLs to crawl,'
                             ' or a comma-separated list of URLs.',
                        default=cm.LOCALIZED_DATASET)
    parser.add_argument('-t', '--type', choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.CRAWL_DIR)
    parser.add_argument('-i', '--crawl-id',
                        help='String used as crawl ID (default=DATE).',
                        default=None)
    parser.add_argument('-e', '--addons_dir',
                        help='Directory with the add-ons to be installed (default=None).',
                        default=None)
    parser.add_argument('-c', '--config',
                        help="Crawler tor driver and controller configurations.",
                        choices=config.sections(),
                        default="default")
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-f', '--tor-binary-path',
                        help="Path to the Tor binary.")
    parser.add_argument('-g', '--tor-data-path',
                        help="Path to the Tor data directory.")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='increase output verbosity',
                        default=False)
    parser.add_argument('-r', '--recover-file',
                        help="File with checkpoint to recover from.",
                        default=None)

    # Crawler features
    parser.add_argument('-x', '--virtual-display',
                        help='Dimensions of the virtual display, eg 1200x800',
                        default='')
    parser.add_argument('-s', '--screenshots', action='store_true',
                        help='Capture page screenshots',
                        default=False)
    parser.add_argument('-d', '--device',
                        help='Interface to sniff the network traffic',
                        choices=cm.IFACES,
                        default='eth0')

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Select URLs from this line number: (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Select URLs after this line number: (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Set crawl ID
    if args.crawl_id:
        cm.set_crawl_id(args.crawl_id)
    del args.crawl_id

    # Change results dir if output
    cm.CRAWL_DIR = abspath(args.output)
    cm.LOGS_DIR = join(cm.CRAWL_DIR, 'logs')
    cm.CRAWL_LOG_FILENAME = join(cm.LOGS_DIR, 'crawl.log')
    cm.TOR_LOG_FILENAME = join(cm.LOGS_DIR, 'tor.log')
    if args.recover_file is not None:
        if isfile(cm.CRAWL_LOG_FILENAME):
            move(cm.CRAWL_LOG_FILENAME,
                 cm.CRAWL_LOG_FILENAME + '.' + cm.CRAWL_ID)
        if isfile(cm.TOR_LOG_FILENAME):
            move(cm.TOR_LOG_FILENAME,
                 cm.TOR_LOG_FILENAME + '.' + cm.CRAWL_ID)
    del args.output

    # Set local IP
    addresses = ifaddresses(args.device)
    ips = addresses.setdefault(AF_INET, [{'addr': 'No IP'}])
    cm.LOCAL_IP = ips[0]['addr']

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
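
# The local-IP lookup above uses the netifaces package. Below is a minimal
# sketch of the same pattern (pip install netifaces); 'eth0' is just the
# parser's default device, so substitute an interface that exists on your
# machine, as ifaddresses raises ValueError for unknown interface names.
from netifaces import AF_INET, ifaddresses

device = 'eth0'
addresses = ifaddresses(device)
# setdefault supplies a placeholder when the interface has no IPv4 address
ips = addresses.setdefault(AF_INET, [{'addr': 'No IP'}])
print(ips[0]['addr'])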