Example #1
def main():

    p = create_option_parser()
    opts, files = p.parse_args()

    try:
        desired_pages_fn, dump_date, out_file = files
    except ValueError:
        p.error("Bad number of arguments! Try with --help option")

    ## creating dump date object
    dump = lib.yyyymmdd_to_datetime(dump_date).date()

    ## list of not-to-be-analyzed groups
    groups = [g for g in opts.groups.split(',') if g]

    ## creating processor
    processor = EventsProcessor(lang=opts.lang,
                                range_=opts.range_,
                                skip=opts.skip,
                                dump_date=dump,
                                groups=groups,
                                desired=opts.desired,
                                output_file=files[2])

    ## set desired pages
    processor.set_desired(desired_pages_fn)
    ## main process
    processor.process(threshold=opts.ratio)
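The create_option_parser() helper used above is not shown on this page. Judging from the (opts, files) pair returned by parse_args() and the call to p.error(), it is an optparse.OptionParser. The sketch below is a hypothetical reconstruction: the option names are guessed from the attributes the example reads (lang, range_, skip, groups, desired, ratio) and are not taken from the actual project.

from optparse import OptionParser

def create_option_parser():
    # Hypothetical reconstruction -- option names are inferred from the
    # attributes accessed in the example above, not from the real project.
    p = OptionParser(usage="usage: %prog [options] desired_pages_file dump_date output_file")
    p.add_option('-l', '--lang', default='en', help='wiki language code')
    p.add_option('-r', '--range', dest='range_', type='int', default=None,
                 help='number of days to consider around each event')
    p.add_option('-s', '--skip', type='int', default=None,
                 help='number of initial events to skip')
    p.add_option('-g', '--groups', default='',
                 help='comma-separated list of groups NOT to analyze')
    p.add_option('-d', '--desired', action='store_true', default=False,
                 help='restrict the analysis to the desired pages')
    p.add_option('-t', '--ratio', type='float', default=None,
                 help='threshold passed to processor.process()')
    return p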
Example #2
def main():

    logging.basicConfig(  #filename="graph_longiudinal_analysis.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')
    
    op = create_option_parser()
    args = op.parse_args()
    
    ## explode dump filename in order to obtain wiki lang, dump date and type
    lang, date_, type_ = mwlib.explode_dump_filename(args.file_name)
                    
    fn, start, tw = args.file_name, args.start, args.time_window
    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)
    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw
    
    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen, hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
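mwlib.explode_dump_filename() is also external to this page. From its use here it splits a MediaWiki dump filename into the wiki language, the dump date and the dump type; a minimal sketch under that assumption (the real parser in mwlib may well differ) could be:

import os
import re

def explode_dump_filename(path):
    # Hypothetical sketch: assumes the usual "<lang>wiki-<yyyymmdd>-<type>"
    # naming, e.g. "enwiki-20100130-stub-meta-history.xml.bz2".
    base = os.path.basename(path)
    m = re.match(r'([a-z_\-]+)wiki-(\d{8})-(.+?)\.xml', base)
    if m is None:
        raise ValueError("unrecognized dump filename: %s" % base)
    return m.groups()  # (lang, date_, type_)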
Example #3
def main():
    logging.basicConfig(  #filename="graph_longiudinal_analysis.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    ## explode dump filename in order to obtain wiki lang, dump date and type
    _, date_, _ = mwlib.explode_dump_filename(args.file_name)

    fn, start, tw = args.file_name, args.start, args.time_window
    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)
    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw

    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen, "
                     "hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
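Timr is not defined on this page either; it is used only as a context manager that brackets the analysis. A minimal, hypothetical stand-in that logs the wall-clock time of the with block would be:

import logging
import time

class Timr(object):
    # Hypothetical stand-in: logs how long the body of the `with` block took.
    def __init__(self, label):
        self.label = label

    def __enter__(self):
        self.start = time.time()
        logging.info("%s: started", self.label)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        logging.info("%s: done in %.2f s", self.label, time.time() - self.start)
        return False  # do not swallow exceptions raised inside the block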
Example #4
def main():

    p = create_option_parser()
    opts, files = p.parse_args()

    try:
        desired_pages_fn, dump_date, out_file = files
    except ValueError:
        p.error("Bad number of arguments! Try with --help option")

    ## creating dump date object
    dump = lib.yyyymmdd_to_datetime(dump_date).date()

    ## list of not-to-be-analyzed groups
    groups = [g for g in opts.groups.split(',') if g]

    ## creating processor
    processor = EventsProcessor(lang=opts.lang, range_=opts.range_,
                                skip=opts.skip, dump_date=dump, groups=groups,
                                desired=opts.desired, output_file=out_file)
    processor.encoding = opts.encoding
    ## set desired pages
    processor.set_desired(desired_pages_fn)
    ## main process
    processor.process(threshold=opts.ratio)
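lib.yyyymmdd_to_datetime() only needs to turn a compact date string such as "20100130" into a datetime here; a minimal sketch of such a converter (Example #5 passes it an extra argument that is not reproduced in this sketch) could be:

from datetime import datetime

def yyyymmdd_to_datetime(date_string):
    # Hypothetical sketch: parse a compact "YYYYMMDD" string into a datetime
    # at midnight of that day.
    return datetime.strptime(date_string, "%Y%m%d")

# e.g. yyyymmdd_to_datetime("20100130").date() -> datetime.date(2010, 1, 30)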
Example #5
def main():
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(#filename="usercontributions_export.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    xml, out, threshold = args.dump, args.out, args.threshold

    lang, date_, _ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)

    date_ = yyyymmdd_to_datetime(date_, 1)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tmp = ["Normal"]+[v for _, (_, v) in enumerate(mwlib.get_namespaces(src))]
    namespaces = []
    # fix for quartiles
    for ns in tmp:
        for n in range(1, 5):
            namespaces.append("%s_%d" % (ns, n))
    print namespaces

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit', 'tot_edits', 'active_days',
              'days_since_first_edit', 'left_since', 'diversity_score',
              'first_edit_year', 'first_edit_month', 'first_edit_day',
              'last_edit_year', 'last_edit_month', 'last_edit_day', ]
    fields[2:2] = namespaces
    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces, lang, date_, threshold)

    count = 0
    for user in data_iterator:
        for k, v in user.iteritems():
            if type(v) in [int, float]:
                assert v >= 0, "%s is negative" % (k,)
        dw.writerow(user)

        count += 1
        if not count % 5000:
            logging.info(count)
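The "fix for quartiles" loop in this example turns every namespace label into four per-quartile column names before they are spliced into the CSV header at fields[2:2]. A small illustration with made-up labels:

# Illustration only -- sample labels, not real dump namespaces.
tmp = ["Normal", "Talk"]
namespaces = []
for ns in tmp:
    for n in range(1, 5):
        namespaces.append("%s_%d" % (ns, n))

print(namespaces)
# ['Normal_1', 'Normal_2', 'Normal_3', 'Normal_4',
#  'Talk_1', 'Talk_2', 'Talk_3', 'Talk_4']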