def histogram(training_set, ruleset, trainee, training_cache, delay, tabs,
              show_browser, buckets, rules):
    """Show a histogram of rule scores.

    We also break down what proportion of each bucket comprised positive or
    negative samples. Altogether, this gives you an idea whether a rule is
    broadly applicable, discriminatory, and spitting out what you expect.
    """
    training_set = Path(training_set)
    if training_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TRAINING_SET_FOLDER is passed a directory.')
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TRAINING_SET_FOLDER is passed a directory.')
    training_data = make_or_find_vectors(ruleset, trainee, training_set,
                                         training_cache, show_browser,
                                         'training', delay, tabs)
    training_pages = training_data['pages']
    x, y, num_yes, _ = tensors_from(training_pages)
    feature_names = training_data['header']['featureNames']
    print_feature_report(
        feature_metrics(feature_names, x, y, buckets, rules or feature_names))
def get_tortoise_config(ctx: Context, tortoise_orm: str) -> dict:
    """Get the Tortoise ORM config dict from a dotted module path.

    :param ctx: click context
    :param tortoise_orm: dotted path to the config object, e.g. "settings.TORTOISE_ORM"
    :return: the Tortoise ORM config dict
    """
    splits = tortoise_orm.split(".")
    config_path = ".".join(splits[:-1])
    tortoise_config = splits[-1]
    try:
        config_module = importlib.import_module(config_path)
    except (ModuleNotFoundError, AttributeError):
        raise BadOptionUsage(ctx=ctx,
                             message=f'No config named "{config_path}"',
                             option_name="--config")

    config = getattr(config_module, tortoise_config, None)
    if not config:
        raise BadOptionUsage(
            option_name="--config",
            message=f'Can\'t get "{tortoise_config}" from module "{config_module}"',
            ctx=ctx,
        )
    return config
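# For context, a minimal sketch of the dotted-path convention that
# get_tortoise_config resolves. The module and attribute names here are
# hypothetical; the function only relies on the split between the module
# path (everything before the last dot) and the attribute name (after it).

# settings.py (hypothetical module)
TORTOISE_ORM = {
    "connections": {"default": "sqlite://db.sqlite3"},
    "apps": {
        "models": {"models": ["models"], "default_connection": "default"},
    },
}

# Elsewhere, inside a click command: "settings" is imported with
# importlib.import_module() and "TORTOISE_ORM" is fetched with getattr():
#     config = get_tortoise_config(ctx, "settings.TORTOISE_ORM")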
def main(testing_set, weights, confidence_threshold, ruleset, trainee,
         testing_cache, delay, show_browser, verbose):
    """Compute the accuracy of the given coefficients and biases on a folder
    of testing samples.

    TESTING_SET_FOLDER is a directory of labeled testing pages. It can also
    be, for backward compatibility, a JSON file of vectors from FathomFox's
    Vectorizer.

    WEIGHTS should be a JSON-formatted object like this. You can paste it
    directly from the output of fathom-train.

        {"coeffs": [["nextAnchorIsJavaScript", 1.1627885103225708],
                    ["nextButtonTypeSubmit", 4.613410949707031],
                    ["nextInputTypeSubmit", 4.374269008636475]],
         "bias": -8.645608901977539}

    """
    testing_set = Path(testing_set)
    if testing_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TESTING_SET_FOLDER is passed a directory.')
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TESTING_SET_FOLDER is passed a directory.')
    with make_or_find_vectors(ruleset, trainee, testing_set, testing_cache,
                              show_browser, 'testing',
                              delay).open(encoding='utf-8') as testing_file:
        testing_data = load(testing_file)
    testing_pages = testing_data['pages']
    x, y, num_yes = tensors_from(testing_pages)
    model = model_from_json(weights, len(y[0]),
                            testing_data['header']['featureNames'])
    accuracy, false_positives, false_negatives = accuracy_per_tag(
        y, model(x), confidence_threshold)
    print(pretty_accuracy('Testing', accuracy, len(x), false_positives,
                          false_negatives, num_yes))
    if testing_pages and 'time' in testing_pages[0]:
        print(speed_readout(testing_pages))
    if verbose:
        print('\nTesting per-tag results:')
        print_per_tag_report([per_tag_metrics(page, model, confidence_threshold)
                              for page in testing_pages])
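# Hypothetical illustration of what a model built from such a WEIGHTS blob
# computes, assuming the default single linear layer (model_from_json may
# build a deeper network when --layers was used at training time):
import math

weights = {"coeffs": [["nextAnchorIsJavaScript", 1.1627885103225708],
                      ["nextButtonTypeSubmit", 4.613410949707031],
                      ["nextInputTypeSubmit", 4.374269008636475]],
           "bias": -8.645608901977539}

def confidence(feature_values):
    """Sigmoid of the weighted feature sum: the per-node score compared
    against --confidence-threshold."""
    score = weights["bias"] + sum(
        coeff * value
        for (_, coeff), value in zip(weights["coeffs"], feature_values))
    return 1 / (1 + math.exp(-score))

print(confidence([1.0, 1.0, 1.0]))  # ~0.82 for this blob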
def check_settings(self):
    if not self.settings.telegram_bot_token:
        raise BadOptionUsage(
            'channel',
            'Telegram bot token is required for the Telegram channel')
    if not self.settings.telegram_chat_id:
        raise BadOptionUsage(
            'channel',
            'Telegram Chat ID is required for the Telegram channel')
def get_tortoise_config(ctx: Context, tortoise_orm: str) -> dict:
    """Get the Tortoise ORM config dict from a dotted module path.

    :param ctx: click context
    :param tortoise_orm: dotted path to the config object, e.g. "settings.TORTOISE_ORM"
    :return: the Tortoise ORM config dict
    """
    splits = tortoise_orm.split(".")
    config_path = ".".join(splits[:-1])
    tortoise_config = splits[-1]
    try:
        config_module = importlib.import_module(config_path)
    except ModuleNotFoundError as e:
        raise ClickException(f"Error while importing configuration module: {e}") from None

    config = getattr(config_module, tortoise_config, None)
    if not config:
        raise BadOptionUsage(
            option_name="--config",
            message=f'Can\'t get "{tortoise_config}" from module "{config_module}"',
            ctx=ctx,
        )
    return config
def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions,
         interesting_files, stdout_flags, progress_enabled, timeout, max_depth,
         not_follow_subdomains, exclude_sources, proxies, delay,
         not_allow_redirects):
    """Find web directories without bruteforce."""
    if exclude_flags and include_flags:
        # Click >= 7 requires an option name as the first argument.
        raise BadOptionUsage(
            'include-flags',
            '--exclude-flags and --include-flags are mutually exclusive.')
    welcome()
    if not urls:
        click.echo('•_•) OOPS! Add urls to analyze.\n'
                   'For example: dirhunt http://domain/path\n\n'
                   'Need help? Then use dirhunt --help', err=True)
        return
    exclude_flags, include_flags = flags_range(exclude_flags), flags_range(include_flags)
    progress_enabled = ((sys.stdout.isatty() or sys.stderr.isatty())
                        if progress_enabled is None else progress_enabled)
    crawler = Crawler(max_workers=threads,
                      interesting_extensions=interesting_extensions,
                      interesting_files=interesting_files,
                      std=sys.stdout if sys.stdout.isatty() else sys.stderr,
                      progress_enabled=progress_enabled, timeout=timeout,
                      depth=max_depth,
                      not_follow_subdomains=not_follow_subdomains,
                      exclude_sources=exclude_sources,
                      not_allow_redirects=not_allow_redirects,
                      proxies=proxies, delay=delay)
    crawler.add_init_urls(*urls)
    try:
        catch_keyboard_interrupt(crawler.print_results, crawler.restart)(
            set(exclude_flags), set(include_flags))
    except SystemExit:
        crawler.close()
    crawler.print_urls_info()
    if not sys.stdout.isatty():
        output_urls(crawler, stdout_flags)
def main(database: str, output: str) -> None:
    """Export rdial data for use with timew.

    Writes timew compatible data to ‘output’.
    """
    if exists(output):
        raise BadOptionUsage('output', 'Output path must not exist')
    files = process_events(database)
    write_events(output, files)
def main(input: TextIO, output: str) -> None:
    """Export timew data for use with rdial.

    Reads the output of ‘timew export’, and writes rdial compatible data to
    ‘output’.
    """
    if path.exists(output):
        raise BadOptionUsage('output', 'Output path must not exist')
    files = process_records(input)
    write_events(output, files)
def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions,
         interesting_files, stdout_flags, progress_enabled, timeout, max_depth,
         not_follow_subdomains, exclude_sources, proxies, delay,
         not_allow_redirects, limit, to_file):
    """Find web directories without bruteforce."""
    if exclude_flags and include_flags:
        # Click >= 7 requires an option name as the first argument.
        raise BadOptionUsage(
            'include-flags',
            '--exclude-flags and --include-flags are mutually exclusive.')
    welcome()
    urls = flat_list(urls)
    proxies = multiplier_args(proxies)
    if not urls:
        click.echo('•_•) OOPS! Add urls to analyze.\n'
                   'For example: dirhunt http://domain/path\n\n'
                   'Need help? Then use dirhunt --help', err=True)
        return
    exclude_flags, include_flags = flags_range(exclude_flags), flags_range(include_flags)
    progress_enabled = ((sys.stdout.isatty() or sys.stderr.isatty())
                        if progress_enabled is None else progress_enabled)
    crawler = Crawler(max_workers=threads,
                      interesting_extensions=interesting_extensions,
                      interesting_files=interesting_files,
                      std=sys.stdout if sys.stdout.isatty() else sys.stderr,
                      progress_enabled=progress_enabled, timeout=timeout,
                      depth=max_depth,
                      not_follow_subdomains=not_follow_subdomains,
                      exclude_sources=exclude_sources,
                      not_allow_redirects=not_allow_redirects,
                      proxies=proxies, delay=delay, limit=limit,
                      to_file=to_file)
    if os.path.exists(crawler.get_resume_file()):
        click.echo('Resuming the previous program execution...')
        try:
            crawler.resume(crawler.get_resume_file())
        except IncompatibleVersionError as e:
            click.echo(e)
    crawler.add_init_urls(*urls)
    while True:
        choice = catch_keyboard_interrupt_choices(
            crawler.print_results, ['abort', 'continue', 'results'], 'a')(
                set(exclude_flags), set(include_flags))
        if choice == 'a':
            crawler.close(True)
            click.echo('Created resume file "{}". Run again using the same '
                       'parameters to resume.'.format(crawler.get_resume_file()))
            return
        elif choice == 'c':
            crawler.restart()
            continue
        else:
            break
    crawler.print_urls_info()
    if not sys.stdout.isatty():
        output_urls(crawler, stdout_flags)
    if to_file:
        crawler.create_report(to_file)
    if not to_file and os.path.exists(crawler.get_resume_file()):
        # The resume file is no longer needed; delete it.
        os.remove(crawler.get_resume_file())
def get_app_connection_name(config, app_name: str) -> str:
    """Get the connection name for an app from the config.

    :param config: the Tortoise ORM config dict
    :param app_name: name of the app to look up
    :return: the app's default connection name
    """
    app = config.get("apps").get(app_name)
    if app:
        return app.get("default_connection", "default")
    raise BadOptionUsage(
        option_name="--app",
        message=f'Can\'t get app named "{app_name}"',
    )
def handle_cli_args(
    self,
    url=None,
    platform=None,
    model_name=None,
    scenario_name=None,
    version=None,
    local_data=None,
    _store_as=("platform_info", "scenario_info"),
):
    """Handle command-line arguments.

    May update the :attr:`data_path`, :attr:`platform_info`,
    :attr:`scenario_info`, and/or :attr:`url` settings.
    """
    # Store the path to command-specific data and metadata
    if local_data:
        self.local_data = local_data

    # References to the Context settings to be updated
    platform_info = self.setdefault(_store_as[0], dict())
    scenario_info = self.setdefault(_store_as[1], dict())

    # Store information for the target Platform
    if url:
        if platform or model_name or scenario_name or version:
            raise BadOptionUsage(
                "--platform --model --scenario and/or --version",
                "redundant with --url",
            )

        self.url = url
        urlinfo = ixmp.utils.parse_url(url)
        platform_info.update(urlinfo[0])
        scenario_info.update(urlinfo[1])
    elif platform:
        platform_info["name"] = platform

    # Store information about the target Scenario
    if model_name:
        scenario_info["model"] = model_name
    if scenario_name:
        scenario_info["scenario"] = scenario_name
    if version:
        scenario_info["version"] = version
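# Hypothetical sketch of the URL form parse_url() accepts; ixmp documents
# Scenario URLs as ixmp://PLATFORM/MODEL/SCENARIO[#VERSION]. The platform,
# model, and scenario names below are made up, and the exact return values
# are an assumption based on how urlinfo[0] and urlinfo[1] are used above:
#
#     platform_info, scenario_info = ixmp.utils.parse_url(
#         "ixmp://my-platform/my-model/baseline#3")
#     # platform_info -> {"name": "my-platform"}
#     # scenario_info -> {"model": "my-model", "scenario": "baseline", "version": 3}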
def __init__(self, settings: Settings):
    self.settings = settings
    self.mailbox = MailBox(self.settings.imap_hostname)
    self.mailbox.login(self.settings.imap_username, self.settings.imap_password)
    self.spam_filters = [
        re.compile(spam_filter) for spam_filter in settings.spam_filters
    ]
    self.channels: typing.Dict[str, BaseChannel] = {}
    for channel_name in settings.channels:
        channel_class = self.CHANNELS.get(channel_name)
        if channel_class:
            self.channels[channel_name] = channel_class(settings)
        else:
            raise BadOptionUsage('channel', f'Channel {channel_name} is not defined')
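# Minimal sketch of the CHANNELS registry this constructor consults, assuming
# the pattern suggested by check_settings() above; the class and key names
# here are hypothetical:
class BaseChannel:
    def __init__(self, settings):
        self.settings = settings

class TelegramChannel(BaseChannel):
    def check_settings(self):
        ...  # validates telegram_bot_token / telegram_chat_id, as above

CHANNELS = {'telegram': TelegramChannel}  # class attribute on the owning class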
def main(training_set, validation_set, ruleset, trainee, training_cache,
         validation_cache, delay, show_browser, stop_early, learning_rate,
         iterations, pos_weight, comment, quiet, confidence_threshold, layers,
         exclude):
    """Compute optimal numerical parameters for a Fathom ruleset.

    The usual invocation is something like this::

        fathom-train samples/training --validation-set samples/validation --ruleset rulesets.js --trainee new

    The first argument is a directory of labeled training pages. It can also
    be, for backward compatibility, a JSON file of vectors from FathomFox's
    Vectorizer.

    To see graphs of loss functions, install TensorBoard, then run
    ``tensorboard --logdir runs/``. These will tell you whether you need to
    adjust the ``--learning-rate``.

    Definitions of terms used in output:

    \b
    pruned
        Said of a node that was prematurely eliminated from consideration
        because it did not match the selector of any ``dom()`` call in the
        ruleset

    \b
    target
        A "right answer": a labeled, positive DOM node, one that should be
        recognized.

    """
    training_set = Path(training_set)
    # If they pass in a dir for either the training or validation sets, we
    # need a ruleset and a trainee for vectorizing:
    if (validation_set and validation_set.is_dir()) or training_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')
    training_data = exclude_features(
        exclude,
        make_or_find_vectors(ruleset, trainee, training_set, training_cache,
                             show_browser, 'training', delay))
    training_pages = training_data['pages']
    x, y, num_yes, num_prunes = tensors_from(training_pages, shuffle=True)
    num_samples = len(x) + num_prunes
    if validation_set:
        validation_pages = exclude_features(
            exclude,
            make_or_find_vectors(ruleset, trainee, validation_set,
                                 validation_cache, show_browser, 'validation',
                                 delay))['pages']
        validation_ins, validation_outs, validation_yes, validation_prunes = \
            tensors_from(validation_pages)
        validation_arg = validation_ins, validation_outs
    else:
        validation_arg = None
    layers = list(layers)  # Comes in as a tuple
    full_comment = '.LR={l},i={i}{c}'.format(
        l=learning_rate, i=iterations, c=(',' + comment) if comment else '')
    model = learn(learning_rate, iterations, x, y, confidence_threshold,
                  num_prunes, validation=validation_arg,
                  stop_early=stop_early, run_comment=full_comment,
                  pos_weight=pos_weight, layers=layers)
    print(pretty_coeffs(model, training_data['header']['featureNames']))
    accuracy, false_positives, false_negatives = accuracy_per_tag(
        y, model(x), confidence_threshold, num_prunes)
    print(pretty_accuracy('Training', accuracy, num_samples, false_positives,
                          false_negatives, num_yes + num_prunes))
    if validation_set:
        accuracy, false_positives, false_negatives = accuracy_per_tag(
            validation_outs, model(validation_ins), confidence_threshold,
            validation_prunes)
        print(pretty_accuracy('Validation', accuracy, len(validation_ins),
                              false_positives, false_negatives,
                              validation_yes + validation_prunes))

    # Print timing information:
    if training_pages and 'time' in training_pages[0]:
        if validation_set and validation_pages and 'time' in validation_pages[0]:
            print(speed_readout(training_pages + validation_pages))
        else:
            print(speed_readout(training_pages))

    if not quiet:
        print('\nTraining per-tag results:')
        print_per_tag_report([per_tag_metrics(page, model, confidence_threshold)
                              for page in training_pages])
        if validation_set:
            print('\nValidation per-tag results:')
            print_per_tag_report([per_tag_metrics(page, model, confidence_threshold)
                                  for page in validation_pages])
def option_error():
    raise BadOptionUsage("Incorrect option", "What a pity")
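# For reference, Click's signature since 7.0 is
# BadOptionUsage(option_name, message, ctx=None), so the call above sets
# option_name="Incorrect option" and message="What a pity". A minimal
# runnable sketch (the command and option names here are made up):
import click
from click import BadOptionUsage

@click.command()
@click.option('--mode')
def cli(mode):
    if mode not in ('fast', 'slow'):
        # option_name first, then the human-readable message:
        raise BadOptionUsage('mode', f'Unknown --mode value: {mode!r}')
    click.echo(f'Running in {mode} mode')

if __name__ == '__main__':
    cli()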
def fetcher(config, show_list, fetchers, check, pool_size, https_only,
            http_check_https, no_socks, save):
    from .proxyfetcher import ProxyFetcher
    from .proxychecker import ProxyChecker

    if show_list:
        for fetcher in ProxyFetcher.registry.values():
            echo(fetcher.name + ' ' + fetcher.__module__ + ':' + fetcher.__name__)
        return

    proxies = OrderedDict()

    checker = None
    if check:
        conf = config.get('proxyfetcher', {})
        if http_check_https:
            conf['https_force_check'] = True
        if pool_size:
            conf['pool_size'] = pool_size
        blacklist = conf.pop('blacklist', None)
        if not blacklist:
            conf['blacklist'] = proxies
        else:
            # Do not check already checked proxies
            conf['blacklist'] = CompositeContains(blacklist, proxies)
        checker = ProxyChecker(**conf)

    json_encoder = JSONEncoder(**config.get('json', {}))

    def proxy(proxy):
        if proxy.addr in proxies:
            proxies[proxy.addr].merge_meta(proxy)
        else:
            proxies[proxy.addr] = proxy

    conf = config.get('proxyfetcher', {})
    fetchers_ = conf.pop('fetchers', None)
    if fetchers == '*':
        fetchers_ = ProxyFetcher.registry
    elif fetchers:
        fetchers_ = fetchers.split(',')
    if not fetchers_:  # Neither the option nor the config supplied fetchers.
        # Click >= 7 requires an option name as the first argument.
        raise BadOptionUsage(
            'fetchers',
            'You should specify fetchers with option or in config.')
    types = set(t.upper() for t in conf.pop('types',
                                            ['HTTP', 'HTTPS', 'SOCKS4', 'SOCKS5']))
    if https_only and not http_check_https:
        types = {'HTTPS', 'SOCKS4', 'SOCKS5'}
    if no_socks:
        types = types.difference(['SOCKS4', 'SOCKS5'])
    if not types:
        raise BadOptionUsage(
            'no-socks',
            'Proxy types appear to be empty. Check config and options compatibility.')
    if pool_size:
        conf['pool_size'] = pool_size
    fetcher = ProxyFetcher(fetchers_, checker=checker, proxy=proxy,
                           types=types, **conf)
    fetcher(join=True)

    http_count, socks_count, sources = 0, 0, {}
    for p in proxies.values():
        if tuple(p.types)[0].name.startswith('HTTP'):
            http_count += 1
        else:
            socks_count += 1
        for source in p.fetch_sources:
            sources.setdefault(source, {'total': 0, 'uniq': 0})
            sources[source]['total'] += 1
            if len(p.fetch_sources) == 1:
                sources[source]['uniq'] += 1
    sources = ', '.join('{0}:total={1[total]} uniq={1[uniq]}'.format(k, v)
                        for k, v in sources.items())
    logging.info('Fetched %s proxies (http(s)=%s, socks=%s %s)',
                 len(proxies), http_count, socks_count, sources)
    json_encoder.dump(proxies.values(), save or sys.stdout)
def main(training_set, validation_set, ruleset, trainee, training_cache,
         validation_cache, delay, show_browser, stop_early, learning_rate,
         iterations, pos_weight, comment, quiet, confidence_threshold, layers,
         exclude):
    """Compute optimal numerical parameters for a Fathom ruleset.

    There are a lot of options, but the usual invocation is something like...

        fathom-train samples/training --validation-set samples/validation --stop-early --ruleset rulesets.js --trainee new

    TRAINING_SET_FOLDER is a directory of labeled training pages. It can also
    be, for backward compatibility, a JSON file of vectors from FathomFox's
    Vectorizer.

    To see graphs of the results, install TensorBoard, then run this:
    tensorboard --logdir runs/. These will tell you whether you need to
    adjust the --learning-rate.

    Some vocab used in the output messages:

    target -- A "right answer" DOM node, one that should be recognized

    candidate -- Any node (target or not) brought into the ruleset, by a
        dom() call, for consideration

    negative sample -- A sample with no intended target nodes, used to bait
        the recognizer into a false-positive choice

    """
    training_set = Path(training_set)
    # If they pass in a dir for either the training or validation sets, we
    # need a ruleset and a trainee for vectorizing:
    if (validation_set and validation_set.is_dir()) or training_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')
    with open(make_or_find_vectors(ruleset, trainee, training_set,
                                   training_cache, show_browser, 'training',
                                   delay),
              encoding='utf-8') as training_file:
        training_data = exclude_features(exclude, load(training_file))
    training_pages = training_data['pages']
    x, y, num_yes = tensors_from(training_pages, shuffle=True)
    if validation_set:
        with open(make_or_find_vectors(ruleset, trainee, validation_set,
                                       validation_cache, show_browser,
                                       'validation', delay),
                  encoding='utf-8') as validation_file:
            validation_pages = exclude_features(exclude,
                                                load(validation_file))['pages']
        validation_ins, validation_outs, validation_yes = tensors_from(validation_pages)
        validation_arg = validation_ins, validation_outs
    else:
        validation_arg = None
    layers = list(layers)  # Comes in as a tuple
    full_comment = '.LR={l},i={i}{c}'.format(
        l=learning_rate, i=iterations, c=(',' + comment) if comment else '')
    model = learn(learning_rate, iterations, x, y, confidence_threshold,
                  validation=validation_arg, stop_early=stop_early,
                  run_comment=full_comment, pos_weight=pos_weight,
                  layers=layers)
    print(pretty_coeffs(model, training_data['header']['featureNames']))
    accuracy, false_positives, false_negatives = accuracy_per_tag(
        y, model(x), confidence_threshold)
    print(pretty_accuracy('Training', accuracy, len(x), false_positives,
                          false_negatives, num_yes))
    if validation_set:
        accuracy, false_positives, false_negatives = accuracy_per_tag(
            validation_outs, model(validation_ins), confidence_threshold)
        print(pretty_accuracy('Validation', accuracy, len(validation_ins),
                              false_positives, false_negatives, validation_yes))

    # Print timing information:
    if training_pages and 'time' in training_pages[0]:
        if validation_set and validation_pages and 'time' in validation_pages[0]:
            print(speed_readout(training_pages + validation_pages))
        else:
            print(speed_readout(training_pages))

    if not quiet:
        print('\nTraining per-tag results:')
        print_per_tag_report([per_tag_metrics(page, model, confidence_threshold)
                              for page in training_pages])
        if validation_set:
            print('\nValidation per-tag results:')
            print_per_tag_report([per_tag_metrics(page, model, confidence_threshold)
                                  for page in validation_pages])