# Imports assumed by the examples below (module-level in the original
# projects; `cm`, `wl_log`, `ut` and `np` are project/third-party modules
# referenced throughout):
import argparse
import ConfigParser  # Python 2; use `configparser` on Python 3
from sys import argv, maxsize
from logging import DEBUG, INFO


def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--url-file', required=True,
                        help='Path to the file that contains the list of URLs to crawl.')
    parser.add_argument('-t', '--type',
                        choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.CRAWL_DIR)
    parser.add_argument('-c', '--config',
                        help="Crawler tor driver and controller configurations.",
                        choices=config.sections(),
                        default="default")
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Increase output verbosity.',
                        default=False)

    # Crawler features
    parser.add_argument('-x', '--virtual-display',
                        help='Dimensions of the virtual display, e.g. 1200x800',
                        default='')
    parser.add_argument('-s', '--screenshots', action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Select URLs starting from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Select URLs up to and including this line number (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Override the results directory with the --output argument
    cm.CRAWL_DIR = args.output
    del args.output

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
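
# Usage sketch (not part of the example above): one way the returned pair
# might be consumed. The option name 'socks_port' inside the chosen config
# section is an assumption for illustration, not taken from the example.
args, config = parse_arguments()
section = args.config  # e.g. "default"
if config.has_option(section, 'socks_port'):
    socks_port = config.getint(section, 'socks_port')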
Example #2
def parse_arguments():
    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--url-list', required=True,
                        help='Path to the file that contains the list of URLs to crawl.')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.RESULTS_DIR)
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_PATH)
    parser.add_argument("-e", "--experiment", choices=cm.EXP_TYPES,
                        help="Specifies the crawling methodology.",
                        default=cm.EXP_TYPE_WANG_AND_GOLDBERG)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Increase output verbosity.',
                        default=False)

    # For understanding batch and instance parameters please refer
    # to Wang and Goldberg's WPES'13 paper, Section 4.1.4
    parser.add_argument('--batches', type=int,
                        help='Number of batches in the crawl (default: %s)' % cm.NUM_BATCHES,
                        default=cm.NUM_BATCHES)
    parser.add_argument('--instances', type=int,
                        help='Number of instances to crawl for each web page (default: %s)' % cm.NUM_INSTANCES,
                        default=cm.NUM_INSTANCES)

    # Crawler features
    parser.add_argument('-x', '--xvfb', action='store_true',
                        help='Use XVFB (for headless testing)',
                        default=False)
    parser.add_argument('-c', '--capture-screen', action='store_true',
                        help='Capture page screenshots',
                        default=False)

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Start crawling URLs from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Stop crawling URLs after this line number (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    wl_log.debug("Command line parameters: %s" % argv)

    return args
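
# Sketch of the batch/instance loop these parameters feed (per Wang and
# Goldberg, WPES'13, Sec. 4.1.4): every page is visited `instances` times
# in each of `batches` passes. `urls` and `visit` are stand-ins here, not
# the project's actual crawl code.
def visit(url):
    print("visiting %s" % url)

urls = ['http://example.com']
args = parse_arguments()
for batch in range(args.batches):
    for url in urls:
        for instance in range(args.instances):
            visit(url)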
Example #3
def parse_arguments():
    # NOTE: the top of this example was truncated in the original listing.
    # The argument definitions below are a minimal reconstruction inferred
    # from the attributes read from `args` further down; the exact flags and
    # help strings are assumptions. The modules `np` (numpy), `cm`, `ut` and
    # `wl_log` come from the source project.
    parser = argparse.ArgumentParser(
        description='Crawl a list of URLs in multiple batches.')
    parser.add_argument('-u', '--url-list', required=True)
    parser.add_argument('--browser-version', default=None)
    parser.add_argument('--batch', default=cm.NUM_BATCHES)
    parser.add_argument('--instance', default=cm.NUM_INSTANCES)
    parser.add_argument('--start', default=None)
    parser.add_argument('--stop', default=None)
    parser.add_argument('-x', '--xvfb', action='store_true', default=False)
    parser.add_argument('-c', '--capture-screen', action='store_true',
                        default=False)
    parser.add_argument('-v', '--verbose', action='store_true',
                        default=False)

    args = parser.parse_args()

    url_list_path = args.url_list
    verbose = args.verbose
    tbb_version = args.browser_version
    no_of_batches = int(args.batch)
    no_of_instances = int(args.instance)
    start_line = int(args.start) if args.start else 1
    stop_line = int(args.stop) if args.stop else 999999999999
    xvfb = args.xvfb
    capture_screen = args.capture_screen

    if verbose:
        wl_log.setLevel(logging.DEBUG)
    else:
        wl_log.setLevel(logging.INFO)

    # Validate the given arguments
    # Read urls
    url_list = np.loadtxt(url_list_path, delimiter='\n', dtype=str)
    url_list = url_list.tolist()
    url_list = url_list[start_line - 1:stop_line]
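    # Roughly equivalent to the three lines above, without the NumPy
    # dependency (an illustrative sketch, not the project's code):
    #   with open(url_list_path) as f:
    #       url_list = [line.strip() for line in f if line.strip()]
    #   url_list = url_list[start_line - 1:stop_line]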
    torrc_dict = cm.TORRC_DEFAULT

    if not tbb_version:
        tbb_version = cm.TBB_DEFAULT_VERSION
    elif tbb_version not in cm.TBB_KNOWN_VERSIONS:
        ut.die('Version of Tor browser is not recognized.')
Example #4
def parse_arguments():
    # Read configuration file
    config = ConfigParser.RawConfigParser()
    config.read(cm.CONFIG_FILE)

    # Parse arguments
    parser = argparse.ArgumentParser(description='Crawl a list of URLs in multiple batches.')

    # List of urls to be crawled
    parser.add_argument('-u', '--urls', required=True,
                        help='Path to the file that contains the list of URLs to crawl,'
                             ' or a comma-separated list of URLs.')
    parser.add_argument('-t', '--type',
                        choices=cm.CRAWLER_TYPES,
                        help="Crawler type to use for this crawl.",
                        default='Base')
    parser.add_argument('-o', '--output',
                        help='Directory to dump the results (default=./results).',
                        default=cm.CRAWL_DIR)
    parser.add_argument('-i', '--crawl-id',
                        help='String used as crawl ID (default=DATE).',
                        default=None)
    parser.add_argument('-e', '--addons-dir',
                        help='Directory with the add-ons to be installed (default=None).',
                        default=None)
    parser.add_argument('-c', '--config',
                        help="Crawler tor driver and controller configurations.",
                        choices=config.sections(),
                        default="default")
    parser.add_argument('-b', '--tbb-path',
                        help="Path to the Tor Browser Bundle directory.",
                        default=cm.TBB_DIR)
    parser.add_argument('-f', '--tor-binary-path',
                        help="Path to the Tor binary.")
    parser.add_argument('-g', '--tor-data-path',
                        help="Path to the Tor data directory.")
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Increase output verbosity.',
                        default=False)
    parser.add_argument('-r', '--recover-file',
                        help="File with checkpoint to recover from.",
                        default=None)

    # Crawler features
    parser.add_argument('-x', '--virtual-display',
                        help='Dimensions of the virtual display, e.g. 1200x800',
                        default='')
    parser.add_argument('-s', '--screenshots', action='store_true',
                        help='Capture page screenshots',
                        default=False)
    parser.add_argument('-d', '--device',
                        help='Interface to sniff the network traffic',
                        choices=cm.IFACES,
                        default='eth0')

    # Limit crawl
    parser.add_argument('--start', type=int,
                        help='Select URLs starting from this line number (default: 1).',
                        default=1)
    parser.add_argument('--stop', type=int,
                        help='Select URLs up to and including this line number (default: EOF).',
                        default=maxsize)

    # Parse arguments
    args = parser.parse_args()

    # Set verbose level
    wl_log.setLevel(DEBUG if args.verbose else INFO)
    del args.verbose

    # Set crawl ID
    if args.crawl_id:
        cm.set_crawl_id(args.crawl_id)
    del args.crawl_id

    # Override the results directory with the --output argument
    cm.CRAWL_DIR = abspath(args.output)
    cm.LOGS_DIR = join(cm.CRAWL_DIR, 'logs')
    cm.CRAWL_LOG_FILENAME = join(cm.LOGS_DIR, 'crawl.log')
    cm.TOR_LOG_FILENAME = join(cm.LOGS_DIR, 'tor.log')

    if args.recover_file is not None:
        if isfile(cm.CRAWL_LOG_FILENAME):
            move(cm.CRAWL_LOG_FILENAME, cm.CRAWL_LOG_FILENAME + '.' + cm.CRAWL_ID)
        if isfile(cm.TOR_LOG_FILENAME):
            move(cm.TOR_LOG_FILENAME, cm.TOR_LOG_FILENAME + '.' + cm.CRAWL_ID)

    del args.output

    # Set local IP
    addresses = ifaddresses(args.device)
    ips = addresses.setdefault(AF_INET, [{'addr': 'No IP'}])
    cm.LOCAL_IP = ips[0]['addr']

    wl_log.debug("Command line parameters: %s" % argv)
    return args, config
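
# Standalone sketch of the local-IP lookup used above; requires the
# `netifaces` package, and the interface name 'eth0' and the AF_INET
# import source are assumptions for illustration.
from socket import AF_INET
from netifaces import ifaddresses
ips = ifaddresses('eth0').setdefault(AF_INET, [{'addr': 'No IP'}])
print(ips[0]['addr'])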