Example #1
def main():
    parser = squid.SquidArgumentParser('filters squid logs by country code')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('country_code')
    parser.add_argument('-o', '--outdir', default='.', help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))

    keepers = ['language', 'project', 'site', 'title']

    criteria = [
            lambda r : r.country_code2() == args.country_code,
            lambda r : r.old_init_request(),
            lambda r : r.datetime() > args.start,
            lambda r : r.datetime() < args.end,
    ]

    counts = squid.count_files(args.squid_files, 
            keepers, 
            criteria,
            count_event=1000,
            limit=args.max_lines,
            nproc=15,
            fname='%s_top_k_titles_incremental.csv' % args.country_code)

    squid.write_counts(counts, '%s_top_k_articles.csv' % args.country_code)
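All of the examples on this page converge on squid.count_files, which takes log files, a list of field names to group by, and a list of per-row predicates, and returns a mapping from field-value tuples to counts. Below is a minimal pure-Python sketch of that contract as inferred from these call sites; the name count_files_sketch and the method-style field access are assumptions, and the real squid.count_files additionally handles multiprocessing (nproc), line limits (limit), progress reporting (count_event), and incremental output (fname).

from collections import Counter

def count_files_sketch(rows, fields, criteria):
    """Hypothetical reference for the grouping/counting behaviour above."""
    counts = Counter()
    for r in rows:
        # keep only rows that satisfy every predicate
        if all(pred(r) for pred in criteria):
            # group by the tuple of requested field values
            # (assumes method-style accessors such as r.site(), r.project())
            key = tuple(getattr(r, f)() for f in fields)
            counts[key] += 1
    return counts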
Example #2
def main():
    parser = squid.SquidArgumentParser('filters squid logs by provider ranges')
    squid.squidrow.load_cidr_ranges()
    parser.add_argument('provider', 
            choices=squid.squidrow.cidr_ranges.keys(),
            help='name of a provider to filter by')
    parser.add_argument('-o', '--outdir', default='.', help='directory in which to place the filtered files')
    args = parser.parse_args()
    logger.info(pprint.pformat(args))
   
    keepers = ['date', 'language', 'project', 'site', 'country', 'na']

    criteria = [
            lambda r : r.site() in ['M', 'Z'],
            lambda r : r.old_init_request(),
            lambda r : r.provider() == args.provider,
            lambda r : r.datetime() > args.start,
            lambda r : r.datetime() < args.end,
    ]

    counts = squid.count_files(args.squid_files, 
            keepers, 
            criteria,
            count_event=1000,
            limit=args.max_lines,
            nproc=15,
            fname='%s_counts_incremental.csv' % args.provider)

    squid.write_counts(counts, '%s_counts.csv' % args.provider)
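write_counts is the companion helper that persists the mapping returned by count_files. A hedged sketch of what it plausibly does, inferred from the (field-tuple -> count) structure that Examples #3 and #4 below flatten by hand; the real implementation may emit a header row or choose a different ordering.

import csv

def write_counts_sketch(counts, fname):
    # flatten each (field-tuple -> count) entry into one CSV row
    with open(fname, 'w') as f:
        writer = csv.writer(f)
        for key, count in sorted(counts.items()):
            writer.writerow(list(key) + [count])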
Example #3
File: compare.py Project: embr/nonce
def get_old():
    if os.path.exists("old_counts.csv"):
        old_df = pd.read_csv("old_counts.csv")
    else:
        file_glob = "/a/squid/archive/zero/zero-orange-*.tab.log-201304*.gz"
        fields = ["date", "host", "x_cs", "cache_status", "status_code", "provider_from_fname"]
        criteria = [lambda r: r.site() == "M", lambda r: r.url_path()[0] == "wiki"]
        # lambda r : r.lang() in ['ar','en','fr','es','de','it','ru','ja','zh', None],
        # lambda r : r.init_request()]
        counts = squid.count_files(file_glob, fields, criteria, count_event=10)
        old_df = pd.DataFrame([key + (count,) for key, count in sorted(counts.items())], columns=fields + ["count"])
        old_df = old_df.rename(columns={"provider_from_fname": "carrier"})
        old_df["carrier"] = old_df["carrier"].apply(lambda c: "orange-tunisia" if c == "orange-tunesia" else c)
        old_df.to_csv("old_counts.csv", index=False)
    print(old_df)
    return old_df
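The DataFrame construction in get_old flattens the same (field-tuple -> count) mapping that count_files returns. A toy illustration with made-up keys (illustrative data only, not real log counts):

import pandas as pd

counts = {('2013-04-01', 'M'): 7, ('2013-04-02', 'Z'): 3}
fields = ['date', 'site']
# each key tuple plus its count becomes one DataFrame row
df = pd.DataFrame([key + (count,) for key, count in sorted(counts.items())],
                  columns=fields + ['count'])
print(df)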
Example #4
def main():
    args = parse_args()
    keepers = ["date", "lang", "project", "site", "na"]

    criteria = [lambda r: r.datetime() > args.start, lambda r: r.datetime() < args.end, lambda r: r.old_init_request()]

    for prov in args.providers:
        if not args.squid_files[prov]:
            logger.info("skipping provider: %s because no files were found", prov)
            continue
        counts = count_files(args.squid_files[prov], keepers, criteria, count_event=10)
        rows = [fields + (prov, count) for fields, count in counts.items()]
        rows = [list(map(str, row)) for row in rows]
        rows.sort(key=itemgetter(*range(len(keepers))))
        with open("%s.counts.csv" % prov, "w") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(keepers + ["provider", "count"])
            writer.writerows(rows)
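The sort key itemgetter(*range(len(keepers))) orders rows lexicographically by their first len(keepers) columns (all strings after the str conversion above). A tiny standalone illustration with made-up rows:

from operator import itemgetter

rows = [['b', '2', 'x'], ['a', '9', 'y'], ['a', '1', 'z']]
rows.sort(key=itemgetter(0, 1))  # same as itemgetter(*range(2))
print(rows)  # [['a', '1', 'z'], ['a', '9', 'y'], ['b', '2', 'x']]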
Example #5
File: process_mr.py Project: embr/nonce
def main():
    args = parse_args()
    for provider in args.providers:
        counts = squid.count_files(args.squid_files[provider],
                            fields = FIELDS, 
                            criteria = [
                                catch_date_error,
                                lambda r : r['old_init_request'],
                                lambda r : r['datetime'] >= args.start_date and r['datetime'] < args.end_date
                                ])
        counts_tabular = [list(k + (c,)) for k, c in counts.items()]
        for row in counts_tabular:
            row[FIELDS.index('date')] = row[FIELDS.index('date')].strftime('%m-%d-%y')
            row[FIELDS.index('providers_full')] = ';'.join(['%s:%s' % prov_full for prov_full in row[FIELDS.index('providers_full')]])
        counts_tabular.sort(key=itemgetter(*range(len(FIELDS))))
        with open('%s.counts.csv' % provider, 'w') as fout:
            csvout = csv.writer(fout)
            csvout.writerows(counts_tabular)
Example #6
from squid import count_files, write_counts
from squid.util import SquidArgumentParser
import logging
import pprint

logger = logging.getLogger(__name__)

if __name__ == '__main__':
    parser = SquidArgumentParser()
    parser.set_defaults(basename='zero-.*', datadir='/a/squid/archive/zero')
    args = parser.parse_args()
    logger.info(pprint.pformat(vars(args)))


    criteria = [
            lambda r : r.site() in ['M', 'Z'],
            lambda r : r.old_init_request(),
            lambda r : r.project() == 'wikipedia',
            ]

    fields = ['date', 'language', 'project', 'site', 'na', 'provider_from_file']
    
    counts = count_files(args.squid_files,
            fields,
            criteria,
            nproc=10,
            limit=args.max_lines,
            fname='carrier_counts_cidr_all.incremental.csv')

    write_counts(counts, 'carrier_counts_cidr_all.counts.csv')
Example #7
File: count.py Project: embr/nonce
import pprint
import datetime
from squid import count_files, write_counts, get_files

urls = set([tuple(line.strip().split('/')[1:]) for line in open('urls.txt')])
pprint.pprint(urls)
glob = '/a/squid/archive/sampled/sampled-1000.tab.log-20130301.gz'
fields = ['date', 'project', 'country', 'title']
criteria = [
        lambda r : r.status_code() < 300,
        lambda r : r.url_path() and r.url_path()[0] == 'wiki',
        lambda r : r.project() == 'wikimediafoundation',
        lambda r : (len(r.url_path()) > 1 and r.url_path()[1] in ['FAQ', 'Ways_to_Give', 'Thank_You']) or r.url_path() in urls]

files = get_files(start = datetime.date(2012,11,15),
        end = datetime.date(2013, 1, 16))
files.extend(get_files(start = datetime.date(2013,2,25),
        end = datetime.date(2013,4,1)))

counts = count_files(files,
        criteria=criteria,
        fields=fields,
        count_event=1000)

write_counts(counts, 'fundraising_pv_custom_init.csv')
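One detail worth noting in this last example: each line of urls.txt is split on '/' and the leading empty string (from the path's leading slash) is dropped by the [1:] slice, so entries in urls are tuples of path segments. The membership test r.url_path() in urls therefore only matches if url_path() also returns a tuple; if it returns a list, tuple(r.url_path()) would be needed. A worked example of the parsing with a hypothetical input line:

# hypothetical urls.txt line; the real file's contents are not shown above
line = '/wiki/Ways_to_Give/2013\n'
print(tuple(line.strip().split('/')[1:]))  # ('wiki', 'Ways_to_Give', '2013')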