def main():
    p = create_option_parser()
    opts, files = p.parse_args()

    try:
        desired_pages_fn, dump_date, out_file = files
    except ValueError:
        p.error("Bad number of arguments! Try with --help option")

    ## creating dump date object
    dump = lib.yyyymmdd_to_datetime(dump_date).date()

    ## list of not-to-be-analyzed groups
    groups = [g for g in opts.groups.split(',') if g]

    ## creating processor
    processor = EventsProcessor(lang=opts.lang, range_=opts.range_,
                                skip=opts.skip, dump_date=dump,
                                groups=groups, desired=opts.desired,
                                output_file=out_file)

    ## set desired pages
    processor.set_desired(desired_pages_fn)

    ## main process
    processor.process(threshold=opts.ratio)
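# lib.yyyymmdd_to_datetime() is defined elsewhere in the toolkit; a minimal
# sketch of what it presumably does (parsing an 8-digit dump date such as
# "20100218" into a datetime object) is given below. This is an assumption,
# not the toolkit's actual implementation; note that the user-contributions
# exporter further down calls it with a second positional argument whose
# meaning is not modeled here.
from datetime import datetime

def yyyymmdd_to_datetime(date_str):
    """Convert a 'YYYYMMDD' string (e.g. from a dump filename) to a datetime."""
    return datetime.strptime(date_str, "%Y%m%d")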
def main():
    logging.basicConfig(#filename="graph_longitudinal_analysis.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    ## explode dump filename in order to obtain wiki lang, dump date and type
    lang, date_, type_ = mwlib.explode_dump_filename(args.file_name)

    fn, start, tw = args.file_name, args.start, args.time_window

    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)

    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw

    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen, "
                     "hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
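# Timr is used above as a context manager around the analysis run; it is
# presumably a small timing helper that logs how long the wrapped block takes.
# The sketch below is only an illustration of that assumed behaviour, not the
# toolkit's real implementation.
import logging
import time

class Timr(object):
    """Log the wall-clock time spent inside a `with` block."""

    def __init__(self, name):
        self.name = name

    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        logging.info("%s took %.2f seconds", self.name, time.time() - self.start)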
def main():
    logging.basicConfig(#filename="graph_longitudinal_analysis.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    ## explode dump filename in order to obtain wiki lang, dump date and type
    _, date_, _ = mwlib.explode_dump_filename(args.file_name)

    fn, start, tw = args.file_name, args.start, args.time_window

    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)

    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw

    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen, "
                     "hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
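# mwlib.explode_dump_filename() is defined elsewhere in the toolkit; the
# sketch below only illustrates the contract assumed above: given a MediaWiki
# dump filename such as "itwiki-20100218-stub-meta-history.xml.bz2", it
# returns the wiki language, the dump date and the dump type. The parsing
# logic here is a guess, not the real implementation.
import os

def explode_dump_filename(fn):
    """Return (lang, date, type) from e.g. 'itwiki-20100218-stub-meta-history.xml.bz2'."""
    base = os.path.basename(fn)
    prefix, date_, rest = base.split('-', 2)
    lang = prefix.replace('wiki', '')      # 'itwiki' -> 'it'
    type_ = rest.split('.', 1)[0]          # 'stub-meta-history'
    return lang, date_, type_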
def main():
    p = create_option_parser()
    opts, files = p.parse_args()

    try:
        desired_pages_fn, dump_date, out_file = files
    except ValueError:
        p.error("Bad number of arguments! Try with --help option")

    ## creating dump date object
    dump = lib.yyyymmdd_to_datetime(dump_date).date()

    ## list of not-to-be-analyzed groups
    groups = [g for g in opts.groups.split(',') if g]

    ## creating processor
    processor = EventsProcessor(lang=opts.lang, range_=opts.range_,
                                skip=opts.skip, dump_date=dump,
                                groups=groups, desired=opts.desired,
                                output_file=out_file)
    processor.encoding = opts.encoding

    ## set desired pages
    processor.set_desired(desired_pages_fn)

    ## main process
    processor.process(threshold=opts.ratio)
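# create_option_parser() for this entry point is not shown. Judging from the
# attributes accessed above (opts.lang, opts.range_, opts.skip, opts.groups,
# opts.desired, opts.ratio, opts.encoding) it is an optparse-based parser that
# also expects three positional arguments. The option strings, types and
# defaults below are illustrative assumptions only.
from optparse import OptionParser

def create_option_parser():
    p = OptionParser(
        usage="usage: %prog [options] desired_pages_file dump_date output_file")
    p.add_option('-l', '--lang', default='en', help='wiki language')
    p.add_option('-r', '--range', dest='range_', type='int', default=10,
                 help='number of days to consider around each event')
    p.add_option('-s', '--skip', type='int', default=0,
                 help='number of days to skip after each event')
    p.add_option('-g', '--groups', default='',
                 help='comma-separated list of groups not to be analyzed')
    p.add_option('-d', '--desired', action='store_true', default=False,
                 help='analyze only the desired pages')
    p.add_option('-t', '--ratio', type='float', default=None,
                 help='threshold passed to process()')
    p.add_option('-e', '--encoding', default='utf-8',
                 help='encoding of the desired pages file')
    return p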
def main():
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(#filename="usercontributions_export.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    xml, out, threshold = args.dump, args.out, args.threshold
    lang, date_, _ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)
    date_ = yyyymmdd_to_datetime(date_, 1)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tmp = ["Normal"] + [v for _, v in mwlib.get_namespaces(src)]
    namespaces = []
    # fix for quartiles
    for ns in tmp:
        for n in range(1, 5):
            namespaces.append("%s_%d" % (ns, n))
    print namespaces

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit', 'tot_edits', 'active_days',
              'days_since_first_edit', 'left_since', 'diversity_score',
              'first_edit_year', 'first_edit_month', 'first_edit_day',
              'last_edit_year', 'last_edit_month', 'last_edit_day',
              ]
    fields[2:2] = namespaces

    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces, lang, date_, threshold)

    count = 0
    for user in data_iterator:
        for k, v in user.iteritems():
            if type(v) in [int, float]:
                assert v >= 0, "%s is negative" % (k,)
        dw.writerow(user)
        count += 1
        if not count % 5000:
            logging.info(count)
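# find_open_for_this_file() is assumed to pick a decompressor based on the
# dump file's extension and to report whether that opener supports reading
# only the first N lines (the second value unpacked above as _lineno). The
# sketch below is one plausible shape for it under that assumption; the real
# helper lives elsewhere in the toolkit.
import bz2
import gzip
from StringIO import StringIO

def find_open_for_this_file(fn):
    """Return (deflate, supports_line_limit) for the given dump filename."""
    if fn.endswith('.bz2'):
        opener = bz2.BZ2File
    elif fn.endswith('.gz'):
        opener = gzip.open
    else:
        return open, False

    def deflate(path, lines=None):
        fh = opener(path)
        if lines is None:
            return fh
        # Read only the first `lines` lines (enough to grab the <namespaces>
        # block from the XML header) and wrap them in a file-like object.
        return StringIO("".join(fh.readline() for _ in xrange(lines)))

    return deflate, True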