def setUp(self):
    xml = "tests/utpedits2graph/" + \
          "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
    self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)
    welcome = defaultdict(str)
    welcome.update({'it': r'Benvenut', 'en': r'Welcome'})
    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)
    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id')
    translations = mwlib.get_translations(src)
    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])
    src.close()
    src = deflate(xml)
    self.processor = HistoryPageProcessor(
        tag=tag,
        user_talk_names=(lang_user_talk, u"User talk"))
    self.processor.welcome_pattern = welcome[self.lang]
    self.processor.start(src)
    self.g = self.processor.get_network()
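# --- Illustrative sketch (not part of the original tests) ---
# A minimal test method built on the fixture above might look like this.
# The assertions are placeholders: `vs`, `es` and the per-edge 'timestamp'
# attribute mirror the igraph-style network returned by get_network()
# elsewhere in this code, but no counts from the vecwiki TEST dump are
# asserted here.
def test_network_is_built(self):
    self.assertTrue(len(self.g.vs) > 0)  # at least one user node
    for e in self.g.es:
        # every edge should carry the timestamps of the talk-page messages
        self.assertTrue(len(e['timestamp']) >= 1)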
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src)  ## PROCESSING
    processor.flush()
    out.close()
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e', '--min-edits', default=0, dest="min_edits",
                 metavar="MIN_EDITS", type=int,
                 help="pages with fewer than MIN_EDITS edits "
                      "are skipped (default: %default)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,redirect,timestamp,ip,"
                             "contributor,title,username")
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag, lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
def main():
    logging.basicConfig(stream=sys.stderr,
                        level=logging.DEBUG)
    # filename="graph_longiudinal_analysis.log",
    logging.info("---------------------START---------------------")

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)
    welcome = defaultdict(str)
    welcome.update({"it": r"Benvenut", "en": r"Welcome"})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(
        src,
        tags="page,title,revision,timestamp,contributor,username,ip,comment")

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations["User"])
        lang_user_talk = unicode(translations["User talk"])
    except UnicodeDecodeError:
        lang_user = smart_str(translations["User"])
        lang_user_talk = smart_str(translations["User talk"])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr("Processing"):
        processor.start(src)  ## PROCESSING

    with Timr("Getting network"):
        g = processor.get_network()

    logging.info("Nodes: %d" % len(g.vs))
    logging.info("Edges: %d" % len(g.es))

    with Timr("Saving graph"):
        save_graph(g, lang, type_, date_)
def main():
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'
                             'minor,timestamp,redirect,ip,username')
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
def main():
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(
        src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >> sys.stderr, "BEGIN PARSING"
    src = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ## TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >> sys.stderr, "Nodes:", len(g.vs)
    print >> sys.stderr, "Edges:", len(g.es)

    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])

    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list desired_words "
              "acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 4:
        p.error("Give me a file, please ;-)")
    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'
                             'minor,timestamp,redirect,text')
    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
def main():
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option("-v", action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option("-e", "--encoding", action="store", dest="encoding",
                 default="latin-1",
                 help="encoding of the desired_list file")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,title,revision,"
                             "minor,timestamp,redirect,ip,username")
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation["Talk"]
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr("Retrieving bots"):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr("Parsing"):
        processor.start(src)
    processor.flush()
def main():
    from functools import partial
    import optparse
    from operator import itemgetter

    p = optparse.OptionParser(
        usage="usage: %prog [options] current_dump rich_graph")
    _, files = p.parse_args()

    if len(files) != 2:
        p.error("Give me a file, please ;-)")
    xml_filename = files[0]
    rich_fn = files[1]

    global lang_user_talk, lang_user, tag, templates

    src = BZ2File(xml_filename)

    tag = mwlib.get_tags(src)

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    user_classes = dict(sg_load(rich_fn).get_user_class(
        'username', ('anonymous', 'bot', 'bureaucrat', 'sysop')))

    # `queue` and `done_queue` are expected to be defined at module level:
    # the worker consumes pages from `queue` and sends the aggregated
    # template counts back on `done_queue`
    p = Process(target=get_freq_dist, args=(queue, done_queue))
    p.start()

    ## XML Reader Process
    partial_process_page = partial(process_page, queue=queue)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    print >> sys.stderr, "end of XML processing"

    queue.put(None)  ## this STOPS the worker process
    templates = done_queue.get()
    p.join()

    for k, v in sorted(templates.items(), key=itemgetter(1), reverse=True):
        print v, k.encode('utf-8')
def main():
    logging.basicConfig(  # filename="random_page_extractor.log",
        stream=sys.stderr,
        level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    with open(args.desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')
    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None
    processor = HistoryRevisionsPageProcessor(
        tag=tag,
        lang=lang,
        output=output,
        threshold=args.ratio,
        min_text=args.min_text_length,
        n_users=args.editors_number,
        start_revision=args.initial_revision)
    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired(desired_pages)
    with Timr('processing'):
        processor.start(src)
def main():
    logging.basicConfig(  # filename="random_page_extractor.log",
        stream=sys.stderr,
        level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')
    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None
    processor = HistoryRevisionsPageProcessor(
        tag=tag,
        lang=lang,
        output=output,
        threshold=args.ratio,
        min_text=args.min_text_length,
        min_revisions=args.revisions_number,
        n_users=args.editors_number,
        start_revision=args.initial_revision)
    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired_from_csv(args.desired_pages_fn,
                                   encoding=args.encoding)
    with Timr('processing'):
        processor.start(src)
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list desired_words "
              "acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 4:
        p.error("Error: No file received.")
    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags='page,title,revision,minor,timestamp,redirect,text')
    src.close()
    src = deflate(xml)

    analyzer = EditsAnalyzer(tag=tag, lang=lang)
    analyzer.set_desired_from_csv(desired_pages_fn)
    analyzer.words = desired_words

    with Timr('Analyzing...'):
        analyzer.start(src)
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option("-v", action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format="%(asctime)s %(levelname)s %(message)s",
                            datefmt="%Y-%m-%d %H:%M:%S")

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,redirect,timestamp,ip,revision,title")
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag, lang=lang,
                                       output=output,
                                       userns=translation["User"],
                                       geoip=geoip_db)
    with Timr("Processing"):
        processor.start(src)  ## PROCESSING
    processor.flush()
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=output)
    processor.talkns = translation['Talk']
    processor.desired_page_type = opts.type
    processor.set_desired(desired_pages)
    processor.start(src)
    processor.flush()
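# --- Illustrative sketch (not part of the original scripts) ---
# Several of these scripts read desired_list with the same convention:
# a CSV whose first column is the page title (decoded as latin-1, or as
# the --encoding option where available) and whose rows starting with '#'
# are skipped as comments. The snippet below builds such a file; the page
# titles are made up for the example.
import csv

rows = [["# desired pages, one title per row"],
        ["Garibaldi"],
        ["Venezia"]]
with open("desired_list.csv", "wb") as f:  # Python 2 csv wants binary mode
    csv.writer(f).writerows(rows)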
def main():
    opts, args = opt_parse()
    xml = args[0]

    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)
    welcome = defaultdict(str)
    welcome.update({'it': r'Benvenut', 'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-C', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximum characters per line (default=100000)")
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed of regexes (default=false)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING
    processor.flush()
    out.close()
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle")
    _, args = p.parse_args()

    if len(args) != 2:
        p.error("Too few or too many arguments")
    xml, rich_fn = args

    global lang_user_talk, lang_user, tag, user_classes

    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    lang, date, _ = mwlib.explode_dump_filename(xml)
    g = sg_load(rich_fn)
    user_classes = dict(g.get_user_class('username',
                                         ('anonymous', 'bot', 'bureaucrat',
                                          'sysop')))

    p = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    p.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process
    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()

    users_cache = {}

    # get a list of pairs (class name, frequency distribution)
    for cls, fd in done_p_receiver.recv():
        with open("%swiki-%s-words-%s.dat" %
                  (lang, date, cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in fd:
                print >> out, v, k
        del fd

    for cls, counters in done_p_receiver.recv():
        with open("%swiki-%s-smile-%s.dat" %
                  (lang, date, cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in counters:
                print >> out, v, k
        del counters

    p.join()

    print >> sys.stderr, "end of FreqDist"
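# --- Illustrative sketch (not part of the original script) ---
# The get_freq_dist worker on the other end of these pipes is not shown
# here. Below is a minimal sketch of the receive loop such a worker might
# run: it consumes messages until the sentinel 0 sent by main() arrives,
# then sends aggregated counts back. The (user_class, words) payload shape
# is an assumption, and the real worker returns two result sets (word
# frequencies, then smiley counters), matching the two recv() calls above;
# the sketch only returns one.
from collections import defaultdict

def get_freq_dist_sketch(receiver, sender):
    counts = defaultdict(lambda: defaultdict(int))
    while True:
        msg = receiver.recv()
        if msg == 0:  # sentinel: stop the worker
            break
        user_class, words = msg  # assumed payload format
        for word in words:
            counts[user_class][word] += 1
    sender.send([(cls, sorted(fd.items())) for cls, fd in counts.items()])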
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', '--per-page', action="store", dest="per_page_stats",
                 help="Per page stats output")
    p.add_option('-e', '--min-edits', action="store", type=int,
                 dest="min_edits",
                 help="Skip if page has fewer than min-edits edits")
    p.add_option('-a', '--min-anon', action="store", type=int,
                 dest="min_anon",
                 help="Skip if page has fewer than min-anon anonymous edits")
    p.add_option('-E', '--exclude', action="store", dest="exclude_countries",
                 help="Countries to exclude, semicolon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag, lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
def main():
    import optparse
    p = optparse.OptionParser(usage="usage: %prog file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-s', action="store", dest="signature", default=None,
                 help="Signature in this language (e.g. sig, firma..)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    try:
        xml = files[0]
    except IndexError:
        p.error("Give me one file, please")

    en_user, en_user_talk = u"User", u"User talk"

    lang, date, type_ = mwlib.explode_dump_filename(xml)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    ns_translation = mwlib.get_translations(src)
    lang_user, lang_user_talk = ns_translation['User'], \
                                ns_translation['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    lang_user = unicode(lang_user, "utf8")
    en_user = unicode(en_user)

    # open dump with an external process to use multiple cores
    _fast = True
    if _fast:
        src.close()
        src = lib.BZ2FileExt(xml)

    if opts.signature is not None:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                                         user_talk_names=(lang_user_talk,
                                                          en_user_talk),
                                         search=(lang_user, en_user),
                                         lang=lang,
                                         signature=opts.signature)
    else:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                                         user_talk_names=(lang_user_talk,
                                                          en_user_talk),
                                         search=(lang_user, en_user),
                                         lang=lang)

    with Timr('Processing'):
        processor.start(src)

    with Timr('Create network'):
        g = processor.ecache.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    g.write("%swiki-%s%s.pickle" % (lang, date, type_), format="pickle")
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
                 dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
                 dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None, help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk,
        one for each day.
        Is this what you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-E', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-d', '--delimiter', action="store", dest="delimiter",
                 default=",", help="CSV delimiter")
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'
                             'minor,timestamp,redirect,ip,username')
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.start_date = opts.start
    processor.end_date = opts.end
    processor.set_desired_from_csv(desired_pages_fn,
                                   encoding=opts.encoding,
                                   delimiter=opts.delimiter)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()