import logging
import pprint

import squid

logger = logging.getLogger(__name__)


def main():
    parser = squid.SquidArgumentParser('filters squid logs by country code')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('country_code')
    parser.add_argument('-o', '--outdir', default='.',
                        help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    # count (language, project, site, title) combinations for initial requests
    # from the given country within the requested date range
    keepers = ['language', 'project', 'site', 'title']
    criteria = [
        lambda r: r.country_code2() == args.country_code,
        lambda r: r.old_init_request(),
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]
    counts = squid.count_files(args.squid_files,
                               keepers,
                               criteria,
                               count_event=1000,
                               limit=args.max_lines,
                               nproc=15,
                               fname='%s_top_k_titles_incremental.csv' % args.country_code)
    squid.write_counts(counts, '%s_top_k_articles.csv' % args.country_code)
import logging
import pprint

import squid

logger = logging.getLogger(__name__)


def main():
    parser = squid.SquidArgumentParser('filters squid logs by provider ranges')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('provider',
                        choices=squid.squidrow.cidr_ranges.keys(),
                        help='name of a provider to filter by')
    parser.add_argument('-o', '--outdir', default='.',
                        help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    # count mobile/zero-site initial requests from the provider's CIDR ranges
    keepers = ['date', 'language', 'project', 'site', 'country', 'na']
    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        lambda r: r.provider() == args.provider,
        lambda r: r.datetime() > args.start,
        lambda r: r.datetime() < args.end,
    ]
    counts = squid.count_files(args.squid_files,
                               keepers,
                               criteria,
                               count_event=1000,
                               limit=args.max_lines,
                               nproc=15,
                               fname='%s_counts_incremental.csv' % args.provider)
    squid.write_counts(counts, '%s_counts.csv' % args.provider)
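# The scripts above rely on squid.count_files returning a dict that maps a
# tuple of the kept field values to an event count, and on squid.write_counts
# flattening that dict into a CSV; the hand-rolled CSV writing in the scripts
# below does the same flattening by hand. A minimal sketch of that flattening
# under those assumptions (the function name and the header row handling here
# are hypothetical, not squid's actual API):
import csv


def write_counts_sketch(counts, fname, fields=None):
    # counts: {(field_value, ...): count}; emit one sorted CSV row per key
    rows = sorted(key + (count,) for key, count in counts.items())
    with open(fname, 'w') as fout:
        writer = csv.writer(fout)
        if fields is not None:
            writer.writerow(list(fields) + ['count'])
        writer.writerows(rows)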
import os

import pandas as pd

import squid


def get_old():
    """Load cached April 2013 Orange counts, recomputing them if the cache is missing."""
    if os.path.exists("old_counts.csv"):
        old_df = pd.read_csv("old_counts.csv")
    else:
        file_glob = "/a/squid/archive/zero/zero-orange-*.tab.log-201304*.gz"
        fields = ["date", "host", "x_cs", "cache_status", "status_code", "provider_from_fname"]
        criteria = [lambda r: r.site() == "M",
                    lambda r: r.url_path()[0] == "wiki"]
        # lambda r : r.lang() in ['ar','en','fr','es','de','it','ru','ja','zh', None],
        # lambda r : r.init_request()]
        counts = squid.count_files(file_glob, fields, criteria, count_event=10)
        old_df = pd.DataFrame([key + (count,) for key, count in sorted(counts.items())],
                              columns=fields + ["count"])
        old_df = old_df.rename(columns={"provider_from_fname": "carrier"})
        # normalize the misspelled carrier name that appears in the archive file names
        old_df["carrier"] = old_df["carrier"].apply(
            lambda c: "orange-tunisia" if c == "orange-tunesia" else c)
        old_df.to_csv("old_counts.csv", index=False)
    print(old_df)
    return old_df
import csv
import logging
from operator import itemgetter

from squid import count_files

logger = logging.getLogger(__name__)


def main():
    args = parse_args()
    keepers = ["date", "lang", "project", "site", "na"]
    criteria = [lambda r: r.datetime() > args.start,
                lambda r: r.datetime() < args.end,
                lambda r: r.old_init_request()]
    for prov in args.providers:
        if not args.squid_files[prov]:
            logger.info("skipping provider: %s because no files were found", prov)
            continue
        counts = count_files(args.squid_files[prov], keepers, criteria, count_event=10)
        # each key is a tuple of the kept field values; append provider and count
        rows = [fields + (prov, count) for fields, count in counts.items()]
        rows = [list(map(str, row)) for row in rows]
        rows.sort(key=itemgetter(*range(len(keepers))))
        with open("%s.counts.csv" % prov, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(keepers + ["provider", "count"])
            writer.writerows(rows)
import csv
from operator import itemgetter

import squid


def main():
    args = parse_args()
    for provider in args.providers:
        counts = squid.count_files(
            args.squid_files[provider],
            fields=FIELDS,
            criteria=[
                catch_date_error,
                lambda r: r['old_init_request'],
                lambda r: args.start_date <= r['datetime'] < args.end_date,
            ])
        counts_tabular = [list(k + (c,)) for k, c in counts.items()]
        for row in counts_tabular:
            row[FIELDS.index('date')] = row[FIELDS.index('date')].strftime('%m-%d-%y')
            # providers_full holds two-element tuples; render each as 'a:b', joined by ';'
            row[FIELDS.index('providers_full')] = ';'.join(
                '%s:%s' % prov_full for prov_full in row[FIELDS.index('providers_full')])
        counts_tabular.sort(key=itemgetter(*range(len(FIELDS))))
        with open('%s.counts.csv' % provider, 'w') as fout:
            csvout = csv.writer(fout)
            csvout.writerows(counts_tabular)
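# catch_date_error above is defined elsewhere in the module; its name and its
# position at the head of the criteria list suggest it screens out rows whose
# timestamp cannot be parsed, so the datetime comparison and strftime calls
# that follow cannot raise. A purely hypothetical sketch of such a criterion:
def catch_date_error(r):
    # assumption: accessing r['datetime'] raises ValueError on a malformed
    # timestamp; keep only rows that parse cleanly
    try:
        r['datetime']
        return True
    except ValueError:
        return False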
from squid import count_files, write_counts
from squid.util import SquidArgumentParser

import logging
import pprint

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    parser = SquidArgumentParser()
    parser.set_defaults(basename='zero-.*', datadir='/a/squid/archive/zero')
    args = parser.parse_args()
    logger.info(pprint.pformat(vars(args)))

    # count mobile/zero-site initial Wikipedia requests per carrier
    criteria = [
        lambda r: r.site() in ['M', 'Z'],
        lambda r: r.old_init_request(),
        lambda r: r.project() == 'wikipedia',
    ]
    fields = ['date', 'language', 'project', 'site', 'na', 'provider_from_file']
    counts = count_files(args.squid_files,
                         fields,
                         criteria,
                         nproc=10,
                         limit=args.max_lines,
                         fname='carrier_counts_cidr_all.incremental.csv')
    write_counts(counts, 'carrier_counts_cidr_all.counts.csv')
import datetime
import pprint

from squid import count_files, write_counts, get_files

# site-relative URL paths to match, one per line in urls.txt
urls = set([tuple(line.strip().split('/')[1:]) for line in open('urls.txt')])
pprint.pprint(urls)

# example of a single sampled log file (unused below; get_files selects the ranges)
glob = '/a/squid/archive/sampled/sampled-1000.tab.log-20130301.gz'

fields = ['date', 'project', 'country', 'title']
criteria = [
    lambda r: r.status_code() < 300,
    lambda r: r.url_path() and r.url_path()[0] == 'wiki',
    lambda r: r.project() == 'wikimediafoundation',
    lambda r: (len(r.url_path()) > 1 and r.url_path()[1] in ['FAQ', 'Ways_to_Give', 'Thank_You'])
              or r.url_path() in urls,
]

# collect files from the two date ranges of interest
files = get_files(start=datetime.date(2012, 11, 15), end=datetime.date(2013, 1, 16))
files.extend(get_files(start=datetime.date(2013, 2, 25), end=datetime.date(2013, 4, 1)))
counts = count_files(files, criteria=criteria, fields=fields, count_event=1000)
write_counts(counts, 'fundraising_pv_custom_init.csv')
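# The urls.txt parsing above assumes one site-relative path per line, split on
# '/' with the leading empty component dropped, so each entry compares directly
# against r.url_path(). For example (the file contents here are hypothetical):
#
#     >>> line = '/wiki/Ways_to_Give/en\n'
#     >>> tuple(line.strip().split('/')[1:])
#     ('wiki', 'Ways_to_Give', 'en')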