def parse_args():
    """Parse command-line options for the zero-log metrics extractor.

    Builds a ``SquidArgumentParser``, parses the command line, then resolves
    the concrete list of squid log files for each requested provider and
    attaches it to the namespace as ``args.squid_files``.

    Returns:
        argparse.Namespace with the parsed options plus ``squid_files``,
        a dict mapping each provider identifier to its list of log files
        (as returned by ``get_files``).
    """
    parser = SquidArgumentParser(
        description=(
            "Process a collection of squid logs and write certain "
            "extracted metrics to file"
        ),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "providers",
        metavar="PROVIDER_IDENTIFIER",
        nargs="*",
        default=DEFAULT_PROVIDERS,
        help="list of provider identifiers used in squid log file names",
    )
    parser.add_argument(
        "--name_format",
        dest="name_format",
        type=str,
        # BUG FIX: the original default "%s.tab.log-%.gz" contained a bare
        # "%." conversion, which raises ValueError the moment the format is
        # applied to the documented (provider_name, date_representation)
        # tuple.  "%s.tab.log-%s.gz" matches the two-element tuple.
        default="%s.tab.log-%s.gz",
        help=(
            "a printf style format string which is formatted with the "
            "tuple: (provider_name, date_representation)"
        ),
    )
    parser.set_defaults(datadir="/a/squid/archive/zero")
    args = parser.parse_args()

    # Custom logic for which files to grab: zero logs are stored under
    # datadir with a "zero-<provider>" basename prefix.
    prov_files = {}
    for prov in args.providers:
        basename = "zero-%s" % prov
        logger.debug("basename: %s", basename)
        # NOTE(review): args.start / args.end are presumably defined by
        # SquidArgumentParser itself -- not visible in this file.
        prov_files[prov] = get_files(args.start, args.end, args.datadir, basename)
    # Plain attribute assignment instead of setattr with a constant name.
    args.squid_files = prov_files
    logger.info(pprint.pformat(args.__dict__))
    return args
import pprint
import datetime

from squid import count_files, write_counts, get_files

# Target URL paths read once from urls.txt.  Each entry is the tuple of
# path components after the leading slash, stored as tuples so they are
# hashable set members.  The file handle is closed deterministically via
# the context manager (the original left it open).
with open('urls.txt') as urls_file:
    urls = {tuple(line.strip().split('/')[1:]) for line in urls_file}
pprint.pprint(urls)

# NOTE(review): `glob` is never used below -- kept for compatibility in
# case another module imports it; confirm before deleting.
glob = '/a/squid/archive/sampled/sampled-1000.tab.log-20130301.gz'

fields = ['date', 'project', 'country', 'title']
criteria = [
    lambda r: r.status_code() < 300,
    lambda r: r.url_path() and r.url_path()[0] == 'wiki',
    lambda r: r.project() == 'wikimediafoundation',
    # NOTE(review): `urls` holds tuples; if r.url_path() returns a list,
    # `r.url_path() in urls` can never match (an unhashable value would
    # even raise TypeError) -- confirm url_path()'s return type.
    lambda r: (len(r.url_path()) > 1
               and r.url_path()[1] in ['FAQ', 'Ways_to_Give', 'Thank_You'])
    or r.url_path() in urls,
]

# BUG FIX: the original used datetime.date(2013, 01, 16); a leading-zero
# integer literal is a SyntaxError on Python 3 (it meant 1 on Python 2),
# so the date is written as (2013, 1, 16).
files = get_files(start=datetime.date(2012, 11, 15),
                  end=datetime.date(2013, 1, 16))
files.extend(get_files(start=datetime.date(2013, 2, 25),
                       end=datetime.date(2013, 4, 1)))

# count_event=1000 is presumably the sampling factor for the 1:1000
# sampled logs -- confirm against squid.count_files.
counts = count_files(files, criteria=criteria, fields=fields, count_event=1000)
write_counts(counts, 'fundraising_pv_custom_init.csv')