Пример #1
0
def parse_args():
    """Parse command-line options and collect the squid log files per provider.

    Builds a ``SquidArgumentParser`` that accepts a positional list of
    provider identifiers (defaulting to ``DEFAULT_PROVIDERS``) and a
    ``--name_format`` option, and overrides the parser default for
    ``datadir`` with ``/a/squid/archive/zero``.

    After parsing, ``get_files`` is called once per provider with the
    basename ``zero-<provider>`` and the results are attached to the
    namespace as ``squid_files`` (a dict keyed by provider identifier).

    Returns:
        The parsed argument namespace, extended with ``squid_files``.
    """
    parser = SquidArgumentParser(
        description="Process a collection of squid logs and write certain extracted metrics to file",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "providers",
        metavar="PROVIDER_IDENTIFIER",
        nargs="*",
        default=DEFAULT_PROVIDERS,
        help="list of provider identifiers used in squid log file names",
    )
    parser.add_argument(
        "--name_format",
        dest="name_format",
        type=str,
        # Two string placeholders, matching the (provider_name,
        # date_representation) tuple described in the help text.  The
        # previous default ended in "%.gz", which is a float conversion
        # followed by a literal "z" and cannot format that tuple.
        default="%s.tab.log-%s.gz",
        help="a printf style format string which is formatted with the tuple: (provider_name, date_representation)",
    )
    parser.set_defaults(datadir="/a/squid/archive/zero")

    args = parser.parse_args()

    # Custom logic for which files to grab.
    # NOTE(review): args.start / args.end are not declared here; presumably
    # SquidArgumentParser registers them -- confirm.
    # NOTE(review): args.name_format is not consulted below; the basename is
    # always "zero-<provider>".
    prov_files = {}
    for prov in args.providers:
        basename = "zero-%s" % prov
        logger.debug("basename: %s", basename)
        prov_files[prov] = get_files(args.start, args.end, args.datadir, basename)
    args.squid_files = prov_files

    # Pass the dump as an argument so stray "%" characters in argument
    # values can never be interpreted as logging format directives.
    logger.info("%s", pprint.pformat(vars(args)))
    return args
Пример #2
0
import pprint
import datetime
from squid import count_files, write_counts, get_files

# Load the URL paths of interest from urls.txt, one per line.  Each line is
# split on '/' and the first component is dropped, so a line such as
# "/wiki/FAQ" becomes the tuple ('wiki', 'FAQ').  The context manager makes
# sure the file handle is closed once the set has been built.
with open('urls.txt') as url_file:
    urls = {tuple(line.strip().split('/')[1:]) for line in url_file}
pprint.pprint(urls)

# NOTE(review): `glob` is not referenced anywhere in the visible part of this
# script; kept in case later code relies on it.
glob = '/a/squid/archive/sampled/sampled-1000.tab.log-20130301.gz'

# Record fields handed to count_files via its `fields` argument.
fields = ['date', 'project', 'country', 'title']

# Predicates handed to count_files via its `criteria` argument.
criteria = [
    lambda r: r.status_code() < 300,
    lambda r: r.url_path() and r.url_path()[0] == 'wiki',
    lambda r: r.project() == 'wikimediafoundation',
    # `urls` holds tuples, so the path is converted before the membership
    # test; an unhashable list would otherwise raise TypeError.
    # NOTE(review): assumes url_path() returns a sequence of path
    # components -- confirm against the squid module.
    lambda r: (
        (len(r.url_path()) > 1
         and r.url_path()[1] in ['FAQ', 'Ways_to_Give', 'Thank_You'])
        or tuple(r.url_path()) in urls
    ),
]

# Two date ranges are combined into a single file list.  The month/day
# arguments are plain decimal literals: a leading-zero literal such as `01`
# is a syntax error on Python 3.
files = get_files(
    start=datetime.date(2012, 11, 15),
    end=datetime.date(2013, 1, 16),
)
files.extend(get_files(
    start=datetime.date(2013, 2, 25),
    end=datetime.date(2013, 4, 1),
))

counts = count_files(
    files,
    criteria=criteria,
    fields=fields,
    count_event=1000,
)

write_counts(counts, 'fundraising_pv_custom_init.csv')