def main():
    """Entry point: obtain geo counts and produce error/MoM plots and stats.

    Counts are loaded from the 'geo_counts.csv' cache when it exists;
    otherwise they are computed from the files named on the command line
    and persisted there for the next run.

    NOTE(review): `keepers`, `criteria` and `dates` are assumed to be
    module-level names defined elsewhere in this file — confirm.
    """
    input_files = sys.argv[1:]
    cache_file = 'geo_counts.csv'
    if not os.path.exists(cache_file):
        # Cache miss: count from scratch and write the cache for next time.
        counts = count_files(input_files, keepers, criteria)
        write_counts(counts, cache_file)
    else:
        counts = load_cache(cache_file)
    plot_error_vs_time(counts, dates)
    plot_country_error_vs_time(counts, dates)
    plot_mom_vs_time(counts, dates)
    basic_stats(counts)
def main():
    """Count carrier (M/Z-site, old-init) requests per key and dump a TSV.

    Reads files matched by the parsed 'glob' option, aggregates counts by
    (date, lang, project, site, country, carrier), logs the distinct
    carriers seen, and writes the table to 'carrier.local.all'
    (tab-separated; incremental checkpoints go to
    'carrier.local.all.incremental').
    """
    opts = parse_args()

    # Keep only mobile ('M') / zero ('Z') site requests that are
    # old-style init requests.
    def _site_is_m_or_z(r):
        return r.site() in ['M', 'Z']

    def _is_old_init(r):
        return r.old_init_request()

    criteria = (_site_is_m_or_z, _is_old_init)
    fields = ('date', 'lang', 'project', 'site', 'country_code2',
              'provider_from_fname')
    counts = count_files(opts['glob'],
                         fields,
                         criteria,
                         count_event=10,
                         fname='carrier.local.all.incremental',
                         limit=10000)

    # Flatten {key_tuple: count} into rows of key fields + count.
    rows = [key + (n,) for key, n in counts.items()]
    df = pd.DataFrame(rows,
                      columns=['date', 'lang', 'project', 'site',
                               'country', 'carrier', 'count'])
    # Normalize the date column to ISO-style 'YYYY-MM-DD' strings.
    df.date = df.date.apply(
        lambda d: datetime.datetime.strftime(d, '%Y-%m-%d'))
    logger.info('carriers: %s', pprint.pformat(df.carrier.unique()))
    df.to_csv('carrier.local.all', index=False, sep='\t')
def main():
    """Count M-site old-init squid requests within [start, end] and save them.

    Aggregates counts keyed by (date, language, project, site, country, na)
    across the squid files selected by the parsed arguments. Incremental
    checkpoints are written to 'country_counts_incremental.csv'; the final
    table goes to 'country_counts.csv'.
    """
    parser = SquidArgumentParser()
    # type=int so a CLI-supplied '--nprocs 20' arrives as an int,
    # consistent with the integer default.
    parser.add_argument('--nprocs', type=int, default=10)
    args = parser.parse_args()
    logger.info(pprint.pformat(args.__dict__))
    keepers = ['date', 'language', 'project', 'site', 'country', 'na']
    criteria = [
        lambda r: r.old_init_request(),
        lambda r: r.site() == 'M',
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]
    # BUG FIX: nproc was hard-coded to 15, which silently ignored the
    # --nprocs command-line option parsed above.
    counts = count_files(args.squid_files,
                         keepers,
                         criteria,
                         count_event=1000,
                         limit=args.max_lines,
                         nproc=args.nprocs,
                         fname='country_counts_incremental.csv')
    write_counts(counts, 'country_counts.csv')
from squid.mapreduce import count_files
import sys
import pandas as pd
from operator import itemgetter

# Aggregate a single squid log file (path given as argv[1]) into per-key
# request counts and write them as CSV next to the input
# ('<input>.counts.csv').
fields = ["country", "x_cs_str", "provider", "date", "site", "lang", "project"]
counts = count_files([sys.argv[1]], fields)

# Flatten {key_tuple: count} into rows of key fields + count, sorted on
# every column (all key fields, then the count).
sort_key = itemgetter(*range(len(fields) + 1))
rows = sorted((key + (n,) for key, n in counts.items()), key=sort_key)

table = pd.DataFrame(rows, columns=fields + ["count"])
table.to_csv("%s.counts.csv" % sys.argv[1], index=False)