import logging
import pprint

from squid.util import SquidArgumentParser

# DEFAULT_PROVIDERS is assumed to be defined elsewhere in this module.


def parse_args():
    parser = SquidArgumentParser(
        description='Process a collection of squid logs and write certain extracted metrics to file')
    parser.add_argument('providers',
                        metavar='PROVIDER_IDENTIFIER',
                        nargs='*',
                        default=DEFAULT_PROVIDERS,
                        help='list of provider identifiers used in squid log file names')
    parser.add_argument('--name_format',
                        dest='name_format',
                        type=str,
                        default='%s.log-%s.gz',
                        help='a printf-style format string which is formatted with the tuple: '
                             '(provider_name, date_representation)')
    parser.set_defaults(datadir='/a/squid/archive/zero')
    args = parser.parse_args()

    # custom logic for which files to grab: resolve the matching squid log
    # files for each provider and stash the mapping on the args namespace
    prov_files = {}
    for prov in args.providers:
        args.basename = prov
        logging.info('args prior to get_files: %s', pprint.pformat(args.__dict__))
        prov_files[prov] = SquidArgumentParser.get_files(args)
    setattr(args, 'squid_files', prov_files)
    logging.info(pprint.pformat(args.__dict__))
    return args
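# A minimal usage sketch (hypothetical, not part of the original scripts):
# parse_args() returns a Namespace whose squid_files attribute maps each
# provider identifier to the log files resolved for it above.
if __name__ == '__main__':
    args = parse_args()
    for provider, files in args.squid_files.items():
        logging.info('%s: %d squid log files found', provider, len(files))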
#!/usr/bin/python
from squid import count_files, write_counts
from squid.util import SquidArgumentParser

import logging
import pprint

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    parser = SquidArgumentParser()
    parser.set_defaults(basename='zero-.*', datadir='/a/squid/archive/zero')
    args = parser.parse_args()
    logger.info(pprint.pformat(vars(args)))

    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        lambda r: r.project == 'wikipedia',
    ]
    fields = ['date', 'language', 'project', 'site', 'na', 'provider_from_file']

    counts = count_files(args.squid_files,
                         fields,
                         criteria,
                         nproc=10,
                         limit=args.max_lines,
                         fname='carrier_counts_cidr_all.incremental.csv')
    write_counts(counts, 'carrier_counts_cidr_all.counts.csv')