def main():
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()

    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'
                             'minor,timestamp,redirect,ip,username')
    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
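# A minimal usage sketch (assumptions: the script file name is illustrative,
# the dump file follows the <lang>wiki-<date>-<type> convention expected by
# explode_dump_filename, and desired_list is a CSV whose first column holds
# page titles, with '#'-prefixed lines treated as comments):
#
#   python events_processor.py -v \
#       itwiki-20100218-stub-meta-history.xml.gz desired_pages.csv 0.5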
def save_graph(g, lang, type_, date_):
    counter = 0
    with Timr('Setting weight attribute on edges'):
        for e in g.es:
            e['weight'] = len(e['timestamp'])
            #e['timestamp'] = str(e['timestamp'])
            counter += 1
            if not counter % 10000:
                logging.debug(counter)
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_),
                format="pickle")
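# A sketch of how save_graph() is meant to be called, assuming g is an
# igraph.Graph whose edges carry a 'timestamp' list attribute (as returned
# by get_network(edge_label='timestamp')): each edge weight becomes the
# number of timestamps, i.e. the number of interactions along that edge.
# Argument values below are illustrative only:
#
#   save_graph(g, lang='it', type_='-stub-meta-history', date_='20100218')
#
# which would write the pickle to itwiki-20100218-stub-meta-history.pickle.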
def main():
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(
        src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >> sys.stderr, "BEGIN PARSING"
    src = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >> sys.stderr, "Nodes:", len(g.vs)
    print >> sys.stderr, "Edges:", len(g.es)

    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])

    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
def main():
    logging.basicConfig(#filename="graph_longitudinal_analysis.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    ## explode dump filename in order to obtain wiki lang, dump date and type
    _, date_, _ = mwlib.explode_dump_filename(args.file_name)

    fn, start, tw = args.file_name, args.start, args.time_window

    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)

    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw

    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen, "
                     "hence not considering the following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e', '--min-edits', default=0, dest="min_edits",
                 metavar="MIN_EDITS", type=int,
                 help="pages with less than MIN_EDITS edits "
                      "are skipped (default: %default)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,redirect,timestamp,ip,"
                             "contributor,title,username")
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag, lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
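# Example invocation (script and file names are illustrative), matching the
# usage string above: skip pages with fewer than 10 edits and log timings:
#
#   python gender_analysis.py -v -e 10 \
#       enwiki-20100218-stub-meta-history.xml.gz gender_data.csv out.csv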
def process_page(self, _):
    self.count += 1
    if not self.count % 1000:
        logging.info(' ### Processed %d pages', self.count)
    self.delattr(("text"))
    if not self._skip:
        with Timr('Flushing %s' % self._title):
            self.flush()
    self._skip = False
def graph_loader(file_name):
    """
    Loads a sonet.graph object from a pickle/graphml/... file
    """
    try:
        with Timr("GRAPH LOADING"):
            return sg.load(file_name)
    except IOError:
        logging.exception("unable to load a graph from passed file: %s",
                          file_name)
        sys.exit()
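# A minimal usage sketch, assuming sg is the sonet graph module used
# elsewhere in this codebase and the pickle was produced by one of the
# graph-saving steps above (the file name is illustrative):
#
#   g = graph_loader('itwiki-20100218-stub-meta-history.pickle')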
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dic_file input_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', action="store_true", dest="percentage", default=False,
                 help="Output results as percentages (like LIWC) "
                      "(default=false)")
    p.add_option('-c', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximum characters per line (default=100000)")
    p.add_option('-i', '--ignorecols', action="store", dest="ignorecols",
                 help="Column numbers of the source file to ignore "
                      "(comma separated and starting from 0)")
    p.add_option('-I', '--id', action="store", dest="id_col", type="int",
                 help="Id column number (starting from 0)", default=0)
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed by regex (default=false)")
    p.add_option('-f', "--flush", action="store", dest="flush", type="int",
                 default=100,
                 help="Flushing to output every N pieces of text")
    p.add_option("--clean", action="store_true", dest="clean", default=False,
                 help="Clean text from wiki syntax/HTML")
    p.add_option('-o', "--output", action="store", dest="output",
                 help="Output file (default=STDOUT)")
    opts, files = p.parse_args()

    if len(files) != 2:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    t = PyWC()
    t.max_char_limit = opts.charlimit
    t.clean_wiki = t.clean_html = opts.clean
    if opts.ignorecols:
        t.ignorecols = [int(x) for x in opts.ignorecols.split(",")]
    t.id_col = opts.id_col
    t.dic_regex = opts.regex
    t.flush_n = opts.flush
    if opts.output is not None:
        t.csv_out = open(opts.output, 'w')
    t.percentage = opts.percentage

    t.set_dic(files[0])
    src = open(files[1], 'r')

    with Timr("Processing"):
        t.start(src)
    t.flush()
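# Example invocation (file names are hypothetical), matching the usage
# string above: count dictionary categories over input.csv, reporting
# LIWC-style percentages and writing the result to results.csv:
#
#   python pywc.py -v -p -o results.csv dictionary.dic input.csv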
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list desired_words "
              "acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()

    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 4:
        p.error("Give me a dump, a desired pages file, a desired words "
                "file and an acceptance ratio, please ;-)")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'
                             'minor,timestamp,redirect,text')
    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
def main():
    logging.basicConfig(#filename="usercontributions.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)
    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id,minor')
    namespaces = [(0, "Normal")] + mwlib.get_namespaces(src)
    src.close()

    logging.info("BEGIN PARSING")
    src = deflate(xml)

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    p = Process(target=use_contrib_dict,
                args=(receiver, processor.namespaces, lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src)  ## PROCESSING

    sender.send(None)
    p.join()  ## wait until save is complete
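# Design note (an inference from the code above, since use_contrib_dict is
# defined elsewhere): parsing and saving run concurrently. The main process
# parses the dump and pushes per-user data through `sender`, while the child
# process drains `receiver` and persists the results; sender.send(None)
# acts as the end-of-stream sentinel that lets the child finish before
# p.join() returns.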
def main():
    logging.basicConfig(#filename="random_page_extractor.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)
    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags='page,title,redirect,text,username,ip,timestamp')
    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None
    processor = HistoryRevisionsPageProcessor(
        tag=tag,
        lang=lang,
        output=output,
        threshold=args.ratio,
        min_text=args.min_text_length,
        min_revisions=args.revisions_number,
        n_users=args.editors_number,
        start_revision=args.initial_revision)
    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    # `desired_pages_fn` was undefined in the original code; it is assumed
    # here to come from the option parser (the attribute name is a guess)
    processor.set_desired_from_csv(args.desired_pages_fn,
                                   encoding=args.encoding)
    with Timr('processing'):
        processor.start(src)
def cumulative_analysis(fn, start, end, freq):
    logging.info("running cumulative analysis")

    graph = graph_loader(fn)  ## loading graph

    freq_range = int(ceil(((end - start).days + 1) / float(freq)))

    for d in range(freq_range):
        s, e = start, end - timedelta(d * freq)
        if e <= s:
            continue
        ## processing
        if s != start or e != end:
            with Timr("SUBGRAPHING"):
                process_graph(graph, s, e)
        ## printing stats
        print_graph_stats(graph.g)

    del graph
    return
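# A worked example of the windowing arithmetic above (values illustrative):
# if end - start is 31 days and freq is 7, then
# freq_range = ceil(32 / 7.) = 5, so the loop analyses the cumulative
# windows [start, end], [start, end - 7d], ..., [start, end - 28d],
# skipping any window whose end would fall at or before start.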
def main():
    op = create_option_parser()
    (options, args) = op.parse_args()

    if len(args) != 1:
        print "Insert one (and only one) file to process\n"
        op.print_help()
        sys.exit(2)

    fn = args[0]
    lang, date, type_ = mwlib.explode_dump_filename(fn)

    g = sg.load(fn)
    g.time_slice_subgraph(start=options.start, end=options.end)
    g.invert_edge_attr('weight', 'length')

    vn = len(g.g.vs)  # number of vertexes
    en = len(g.g.es)  # number of edges

    timr = Timr()

    if options.as_table:
        tablr = Tablr()
        tablr.start(1024 * 32, lang)

    if options.group or options.users_role or options.histogram:
        for group_name, group_attr in groups.iteritems():
            g.defineClass(group_name, group_attr)
            print ' * %s : nodes number : %d' % (
                group_name, len(g.classes[group_name]))
    else:
        g.defineClass('all', {})

    print " * filename: %s" % (fn,)
    print " * lang: %s" % (lang,)
    print " * date: %s" % (date,)

    if options.details:
        with Timr("details"):
            print " * nodes number: %d" % (vn,)
            print " * edges number: %d" % (en,)

            nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
            nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))
            self_loop_edges = len([edge for edge in g.g.es
                                   if edge.target == edge.source])

            print " * nodes with out edges number: %d (%6f%%)" % (
                nodes_with_outdegree, 100. * nodes_with_outdegree / vn)
            print " * nodes with in edges number: %d (%6f%%)" % (
                nodes_with_indegree, 100. * nodes_with_indegree / vn)
            print " * max weights on edges : %s" % top(g.g.es['weight'])
            print " * self-loop edges: %d" % self_loop_edges
            #print " * diameter : %6f" % g.g.diameter(weights='length')
            #print " * average weight : %6f" % numpy.average(g.g.es['weight'])

    if options.density or options.reciprocity:
        with Timr('density&reciprocity'):
            for cls, vs in g.classes.iteritems():
                if not len(vs) > 1:
                    continue

                subgraph = vs.subgraph()

                print " * %s : density : %.10f" % (cls, subgraph.density())
                print " * %s : reciprocity : %.10f" % (
                    cls, subgraph.reciprocity())

    if options.degree:
        with Timr('degree'):
            g.g.vs['indegree'] = g.g.degree(type=ig.IN)
            g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                ind = numpy.array(vs['indegree'])
                outd = numpy.array(vs['outdegree'])

                print " * %s : mean IN degree (no weights): %f" % (
                    cls, numpy.average(ind))
                print " * %s : mean OUT degree (no weights): %f" % (
                    cls, numpy.average(outd))
                print " * %s : max IN degrees (no weights): %s" % (
                    cls, top(ind))
                print " * %s : max OUT degrees (no weights): %s" % (
                    cls, top(outd))
                print " * %s : stddev IN degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(ind)))
                print " * %s : stddev OUT degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(outd)))

    if options.transitivity:
        ##print " * transitivity: %f" % (nx.transitivity(g), )
        pass

    if options.summary:
        # don't use with --as-table
        print " * summary: %s" % (g.g.summary(),)

    if options.distance:
        with Timr('split clusters'):
            vc = g.g.clusters()
            size_clusters = vc.sizes()
            giant = vc.giant()

            print " * length of 5 max clusters: %s" % top(size_clusters)
            #print " * #node in 5 max clusters/#all nodes: %s" % top(
            #    [1.*cluster_len/vn for cluster_len in size_clusters])

    if options.distance:
        with Timr('distance'):
            gg = sg.Graph(giant)
            print " * average distance in the giant component: %f" % \
                  gg.averageDistance(weight='length')
            print " * average hops in the giant component: %f" % \
                  gg.averageDistance()
            #print "Average distance 2: %f" % giant.average_path_length(
            #    True, False)

    if options.efficiency:
        with Timr('efficiency'):
            print " * efficiency: %f" % g.efficiency(weight='length')

    ##TODO: compute for centrality only if "all" or "degree"
    if (options.plot or options.histogram or options.power_law or
            options.centrality):
        with Timr('set weighted indegree'):
            g.set_weighted_degree()

    if options.centrality:
        timr.start('centrality')
        centralities = options.centrality.split(',')
        if 'all' in centralities:
            centralities = 'betweenness,pagerank,degree'.split(',')

        if set(centralities).difference(
                'betweenness,pagerank,degree'.split(',')):
            logging.error('Unknown centrality')
            sys.exit(0)

        if "betweenness" in centralities:
            print >> sys.stderr, "betweenness"
            g.g.vs['bw'] = g.g.betweenness(weights='length', directed=True)

        #g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

        if 'pagerank' in centralities:
            print >> sys.stderr, "pagerank"
            g.g.vs['pr'] = g.g.pagerank(weights='weight')  # pagerank

        if 'degree' in centralities:
            print >> sys.stderr, "outdegree"
            g.set_weighted_degree(type=ig.OUT)

        #total_weights = sum(g.g.es['weight'])
        max_edges = vn * (vn - 1)

        for cls, vs in g.classes.iteritems():
            if not vs:
                continue

            if "betweenness" in centralities:
                norm_betweenness = numpy.array(g.classes[cls]['bw']) \
                                   / max_edges
                print " * %s : average betweenness : %.10f" % (
                    cls, numpy.average(norm_betweenness))
                print " * %s : stddev betweenness : %.10f" % (
                    cls, numpy.sqrt(numpy.var(norm_betweenness)))
                print " * %s : max betweenness: %s" % (
                    cls, top(numpy.array(g.classes[cls]['bw']) / max_edges))

            #print " * Average eigenvector centrality : %6f" % numpy.average(
            #    g.vs['ev'])

            if 'pagerank' in centralities:
                print " * %s : average pagerank : %.10f" % (
                    cls, numpy.average(g.classes[cls]['pr']))
                print " * %s : stddev pagerank : %.10f" % (
                    cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
                print " * %s : max pagerank: %s" % (
                    cls, top(g.classes[cls]['pr']))

            if 'degree' in centralities:
                wi = g.classes[cls]['weighted_indegree']
                print " * %s : average IN degree centrality (weighted): " \
                      "%.10f" % (cls, numpy.average(wi))
                print " * %s : stddev IN degree centrality (weighted): " \
                      "%.10f" % (cls, numpy.sqrt(numpy.var(wi)))
                print " * %s : max IN degrees centrality (weighted): %s" % (
                    cls, top(wi))
                del wi

                wo = g.classes[cls]['weighted_outdegree']
                print " * %s : average OUT degree centrality (weighted): " \
                      "%.10f" % (cls, numpy.average(wo))
                print " * %s : stddev OUT degree centrality (weighted): " \
                      "%.10f" % (cls, numpy.sqrt(numpy.var(wo)))
                print " * %s : max OUT degrees centrality (weighted): %s" % (
                    cls, top(wo))
                del wo
        timr.stop('centrality')

    if options.power_law:
        with Timr('power law'):
            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                indegrees = vs['weighted_indegree']

                try:
                    alpha_exp = ig.statistics.power_law_fit(indegrees,
                                                            xmin=6)
                    print " * %s : alpha exp IN degree distribution : " \
                          "%10f " % (cls, alpha_exp)
                except ValueError:
                    print >> sys.stderr, \
                          " * %s : alpha exp IN degree distribution : " \
                          "ERROR" % (cls,)

    if options.histogram:
        list_with_index = lambda degrees, idx: [(degree, idx) for degree
                                                in degrees if degree]
        all_list = []

        nogrp_indegrees = g.g.vs.select(sysop_ne=True, bureaucrat_ne=True,
                                        steward_ne=True, founder_ne=True,
                                        bot_ne=True)['weighted_indegree']
        all_list += list_with_index(nogrp_indegrees, 1)

        sysops_indegrees = g.classes['sysop']['weighted_indegree']
        all_list += list_with_index(sysops_indegrees, 2)

        burs_indegrees = g.classes['bureaucrat']['weighted_indegree']
        all_list += list_with_index(burs_indegrees, 3)

        stewards_indegrees = g.classes['steward']['weighted_indegree']
        all_list += list_with_index(stewards_indegrees, 4)

        founders_indegrees = g.classes['founder']['weighted_indegree']
        all_list += list_with_index(founders_indegrees, 5)

        bots_indegrees = g.classes['bot']['weighted_indegree']
        all_list += list_with_index(bots_indegrees, 6)

        if options.gnuplot:
            f = open('hist.dat', 'w')
        else:
            f = open('%swiki-%s-hist.dat' % (lang, date), 'w')

        all_list.sort(reverse=True)

        for indegree, grp in all_list:
            for _ in range(grp - 1):
                print >> f, 0,
            print >> f, indegree,
            for _ in range(grp, 6):
                print >> f, 0,
            print >> f, ""
        f.close()

        if options.gnuplot:
            from popen2 import Popen3

            process = Popen3('gnuplot hist.gnuplot')
            process.wait()

            os.rename('hist.png', '%swiki-%s-hist.png' % (lang, date))
            os.rename('hist.dat', '%swiki-%s-hist.dat' % (lang, date))

    if options.plot:
        ## TODO: evaluate if this can be done with
        ## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
        with Timr('plot'):
            import math

            ## filter:
            #print len(g.g.vs), len(g.g.es)
            #g.set_weighted_degree(type=ig.OUT)
            #g.g = g.g.subgraph(g.g.vs.select(weighted_indegree_ge=10,
            #                                 weighted_outdegree_ge=1))
            #g.g.write_graphml('itwiki-20100729-stub-meta-history_in10_out1.graphml')
            #print len(g.g.vs), len(g.g.es)

            bots = g.g.vs.select(bot=True)
            bots['color'] = ('purple',) * len(bots)
            logging.debug('bots: ok')

            anonyms = g.g.vs.select(anonymous=True)
            anonyms['color'] = ('blue',) * len(anonyms)

            sysops = g.g.vs.select(sysop=True)
            sysops['color'] = ('yellow',) * len(sysops)

            bur_sysops = g.g.vs.select(bureaucrat=True, sysop=True)
            bur_sysops['color'] = ('orange',) * len(bur_sysops)

            g.g.vs['size'] = [math.sqrt(v['weighted_indegree'] + 1) * 10
                              for v in g.g.vs]

            logging.debug('plot: begin')
            ig.plot(g.g, target=lang + "_general.png",
                    bbox=(0, 0, 8000, 8000), edge_color='grey',
                    layout='drl')
            logging.debug('plot: end')

            weights = g.g.es['weight']
            max_weight = max(weights)

            g.g.es['color'] = [(255. * e['weight'] / max_weight, 0., 0.)
                               for e in g.g.es]
            g.g.es['width'] = weights

            ig.plot(g.g, target=lang + "_weighted_edges.png",
                    bbox=(0, 0, 4000, 2400), layout='fr',
                    vertex_label=' ')

    if options.as_table:
        tablr.stop()

        #tablr.printHeader()
        #tablr.printData()
        tablr.saveInDjangoModel()

    if options.adjacency:
        giant = g.g.clusters().giant()
        #destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
        destAdj = "%swiki-%s-adj.csv" % (lang, date)
        #destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
        destRec = "%swiki-%s-rec.csv" % (lang, date)
        sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
        sg.Graph(giant).writeReciprocityMatrix('username', destRec)

    if options.users_role:
        l = g.get_user_class('username',
                             ('anonymous', 'bot', 'bureaucrat', 'sysop'))

        #destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
        destUR = "%swiki-%s-ur.csv" % (lang, date)
        with open(destUR, 'w') as f:
            for username, role in sorted(l):
                print >> f, "%s,%s" % (username, role)

        from random import shuffle
        #destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
        destCls = "%swiki-%s-%%s.csv" % (lang, date)
        for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
            users = g.classes[cls]['username']
            shuffle(users)
            with open(destCls % cls, 'w') as f:
                for username in users:
                    print >> f, \
                          ("%s,http://vec.wikipedia.org/w/index.php?title=" +
                           "Discussion_utente:%s&action=history&offset=" +
                           "20100000000001") % (username, username)
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', '--per-page', action="store", dest="per_page_stats",
                 help="Per page stats output")
    p.add_option('-e', '--min-edits', action="store", type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edits edits")
    p.add_option('-a', '--min-anon', action="store", type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E', '--exclude', action="store", dest="exclude_countries",
                 help="Countries to exclude, semicolon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag, lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
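# Example invocation (file names are hypothetical), matching the usage
# string above: geolocate anonymous edits against a GeoIP database, skip
# pages with fewer than 5 anonymous edits, and exclude two countries:
#
#   python countries_analysis.py -v -a 5 -E "Italy;France" \
#       enwiki-20100218-stub-meta-history.xml.gz GeoIP.dat out.csv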
def main():
    import optparse

    p = optparse.OptionParser(usage="usage: %prog file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-s', action="store", dest="signature", default=None,
                 help="Signature in this language (e.g. sig, firma..)")
    opts, files = p.parse_args()

    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    try:
        xml = files[0]
    except IndexError:
        p.error("Give me one file, please")

    en_user, en_user_talk = u"User", u"User talk"

    lang, date, type_ = mwlib.explode_dump_filename(xml)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    ns_translation = mwlib.get_translations(src)
    lang_user, lang_user_talk = ns_translation['User'], \
                                ns_translation['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    lang_user = unicode(lang_user, "utf8")
    en_user = unicode(en_user)

    # open dump with an external process to use multiple cores
    _fast = True
    if _fast:
        src.close()
        src = lib.BZ2FileExt(xml)

    if opts.signature is not None:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                                         user_talk_names=(lang_user_talk,
                                                          en_user_talk),
                                         search=(lang_user, en_user),
                                         lang=lang,
                                         signature=opts.signature)
    else:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                                         user_talk_names=(lang_user_talk,
                                                          en_user_talk),
                                         search=(lang_user, en_user),
                                         lang=lang)

    with Timr('Processing'):
        processor.start(src)

    with Timr('Create network'):
        g = processor.ecache.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    g.write("%swiki-%s%s.pickle" % (lang, date, type_), format="pickle")
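# Design note: the _fast branch reopens the bzip2 dump through
# lib.BZ2FileExt which, per the inline comment above, decompresses in an
# external process so decompression and parsing can run on separate cores;
# the plain BZ2File handle is only used for the cheap header reads (tags
# and namespace translations) before being closed.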
def main():
    opts, args = opt_parse()
    xml = args[0]

    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    welcome = defaultdict(str)
    welcome.update({'it': r'Benvenut', 'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(
        tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analyze (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
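# Example invocation (script and file names are hypothetical): extract
# cleaned talk-page revisions for the pages listed in desired_pages.csv,
# with a 2-second diff timeout:
#
#   python revisions_extractor.py -v -t talk -c -T 2.0 \
#       enwiki-20100218-pages-meta-history.xml.bz2 desired_pages.csv out.csv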
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout",
                 type=float, default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
                 dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
                 dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD",
                 default=None, help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(0)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean

    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for
        each day. Is this what you really want to do?
        [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
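# Example invocation (script and file names are hypothetical): run the
# PyWC processor over a dump with detailed per-day CSV output for March
# 2010 on the Normal namespace:
#
#   python pywc_analysis.py -v -S 20100301 -E 20100331 -n Normal \
#       enwiki-20100218-pages-meta-history.xml.bz2 dictionary.dic out.csv
#
# The -S/-E values are parsed by SonetOption's custom "yyyymmdd" type.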
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file output_file",
        option_class=SonetOption)
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output")
    p.add_option('-i', '--ignorecols', action="store", dest="ignorecols",
                 help="Column numbers of the source file to ignore "
                      "(comma separated and starting from 0)")
    p.add_option('-I', '--id', action="store", dest="id_col", type="int",
                 help="Id column number (starting from 0)", default=0)
    p.add_option('-o', '--onlycols', action="store", dest="onlycols",
                 help="Select only this set of columns "
                      "(comma separated and starting from 0)")
    p.add_option('-p', '--percentages', action="store_true", dest="perc",
                 help="Use percentages instead of absolute value")
    p.add_option('-w', '--window', action="store", dest="window", type=int,
                 help="Collapse days")
    p.add_option('-g', '--group', action="store", dest="group",
                 help="Group by weekday/month")
    p.add_option('-S', '--sliding', action="store", dest="smooth", type=int,
                 help="Sliding window")
    p.add_option('--exclude-less-than', action="store",
                 dest="excludelessthan", type=int,
                 help=("Exclude lines with totals (or dic if -d option is "
                       "used) smaller than this parameter"))
    p.add_option('--exclude-more-than', action="store",
                 dest="excludemorethan", type=int,
                 help=("Exclude lines with totals (or dic if -d option is "
                       "used) greater than this parameter"))
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")
    p.add_option('-d', '--dic', action="store_true", dest="dic",
                 default=False,
                 help="Calculate percentage over dic column instead of total")
    p.add_option('-n', '--namespaces', action="store", dest="namespaces",
                 help="Output only selected namespaces (comma separated)")
    opts, files = p.parse_args()

    if len(files) != 2:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    csv_reader = csv.reader(open(files[0]), delimiter="\t")

    onlycols = None
    ignorecols = None
    if opts.onlycols:
        onlycols = [int(x) for x in opts.onlycols.split(",")]
    if opts.ignorecols:
        ignorecols = [int(x) for x in opts.ignorecols.split(",")]

    # content contains all the csv file
    content = [row for row in csv_reader]

    # columns to skip (namespace, text, total, ...)
    ns_index = 1
    len_line = len(content[0])
    to_skip = [ns_index, len_line - 1, len_line - 2, len_line - 3,
               opts.id_col]

    # CSV header, only of interesting columns
    header = [x for x in _gen_data(content[0], to_skip, ignorecols, onlycols)]

    namespaces = {}
    for row in content[1:]:
        try:
            namespaces[row[ns_index]].append(row)
        except KeyError:
            namespaces[row[ns_index]] = [row]

    pdf_pag = PdfPages(files[1])

    opts.namespaces = opts.namespaces.split(",") if opts.namespaces else None
    for ns in namespaces:
        if (not opts.namespaces) or (ns in opts.namespaces):
            logging.info("Processing namespace: %s", ns)
            # Creates a matrix (list) with percentages of the occurrences
            # of every category. Don't count id, total, text, ignored
            # columns. If onlycols is set, consider only those columns.
            mat = []
            timestamps = []
            totals = []

            tot_index = -3
            if opts.dic:
                tot_index = -5

            for line in namespaces[ns]:
                # filter only pages with total (or dic if -d) greater or
                # smaller than X
                if opts.excludemorethan:
                    if float(line[tot_index]) > opts.excludemorethan:
                        continue
                if opts.excludelessthan:
                    if float(line[tot_index]) < opts.excludelessthan:
                        continue

                mat.append([x for x in _gen_data(line, to_skip,
                                                 ignorecols, onlycols)])
                totals.append(float(line[tot_index]))
                timestamps.append(dt.strptime(line[opts.id_col], "%Y/%m/%d"))

            mat = np.array(mat, dtype=np.float).transpose()
            logging.info("Input file read. Ready to plot")

            with Timr("Plotting"):
                for i, series in enumerate(mat):
                    logging.info("Plotting page %d", i + 1)

                    # Don't plot zeros and skip zero revisions!
                    #ser = [x for x in series if x != 0]
                    #time = [x for k, x in enumerate(timestamps)
                    #        if series[k] != 0]
                    #tot = [x for k, x in enumerate(totals) if series[k] != 0]

                    ser = [x for k, x in enumerate(series)
                           if (not opts.start or timestamps[k] >= opts.start)
                           and (not opts.end or timestamps[k] <= opts.end)]
                    time = [x for k, x in enumerate(timestamps)
                            if (not opts.start or x >= opts.start)
                            and (not opts.end or x <= opts.end)]
                    tot = [x for k, x in enumerate(totals)
                           if (not opts.start or timestamps[k] >= opts.start)
                           and (not opts.end or timestamps[k] <= opts.end)]

                    if opts.smooth and len(time) and len(ser) and len(tot):
                        time, ser, tot = smooth_values(time, ser, tot,
                                                       opts.smooth)

                    if opts.window and len(time) and len(ser) and len(tot):
                        time, ser, tot = collapse_values(time, ser, tot,
                                                         opts.window)

                    if opts.group and len(time) and len(ser) and len(tot):
                        time, ser, tot = group_values(time, ser, tot,
                                                      opts.group)

                    try:
                        mean = float(sum(series)) / len(series)
                    except ZeroDivisionError:
                        continue

                    # rel_mean is the mean for the period
                    # [opts.start, opts.end]
                    try:
                        rel_mean = float(sum(ser)) / len(ser)
                    except ZeroDivisionError:
                        continue

                    if opts.perc:
                        try:
                            mean = float(sum(series)) / sum(totals)
                            rel_mean = float(sum(ser)) / sum(tot)
                        except ZeroDivisionError:
                            mean = 0
                            rel_mean = 0
                        # Calculate percentages
                        ser = [calc_perc(x, tot[k])
                               for k, x in enumerate(ser)]
                        # Set axis limit 0-1 IS IT GOOD OR BAD?
                        #axis.set_ylim(0, 1)
                        plt.ylabel("%")

                    first_time = time[0].date()
                    last_time = time[-1].date()

                    plt.clf()
                    plt.subplots_adjust(bottom=0.25)
                    plt.xticks(rotation=90)
                    fig = plt.gcf()
                    fig.set_size_inches(11.7, 8.3)
                    axis = plt.gca()
                    axis.xaxis.set_major_formatter(
                        md.DateFormatter('%Y-%m-%d'))
                    axis.set_xlim(matplotlib.dates.date2num(first_time),
                                  matplotlib.dates.date2num(last_time))
                    if last_time - first_time < timedelta(days=30):
                        axis.xaxis.set_major_locator(
                            md.DayLocator(interval=1))
                        axis.xaxis.set_minor_locator(
                            md.DayLocator(interval=1))
                    else:
                        axis.xaxis.set_minor_locator(
                            md.MonthLocator(interval=1))
                        rule = md.rrulewrapper(md.MONTHLY, interval=4)
                        auto_loc = md.RRuleLocator(rule)
                        axis.xaxis.set_major_locator(auto_loc)
                    axis.tick_params(labelsize='x-small')
                    plt.xlabel("Revisions Timestamp")

                    if len(time) and len(ser):
                        if opts.window:
                            time = [t.date() for t in time]
                        logging.info("Mean: %f", mean)
                        logging.info("Relative Mean: %f", rel_mean)
                        plt.plot(matplotlib.dates.date2num(time), ser, "b.-")
                        plt.axhline(y=mean, color="r")
                        plt.title("%s - %s - Mean: %.5f - "
                                  "Relative mean: %.5f" %
                                  (ns, header[i], round(mean, 5),
                                   round(rel_mean, 5)))
                        pdf_pag.savefig()

    pdf_pag.close()
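# Example invocation (script and file names are hypothetical): plot
# percentages with a 7-day sliding window, only for the Normal namespace,
# restricted to revisions from 2010:
#
#   python pywc_plot.py -v -p -S 7 -n Normal -s 20100101 -e 20101231 \
#       input.csv output.pdf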
def get_network(self):
    with Timr('Flushing'):
        self.ecache.flush()
    return self.ecache.get_network(edge_label='timestamp')
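# A minimal sketch of the calling side, assuming a processor that holds an
# EdgeCache instance as self.ecache: flush() writes out any buffered edges,
# and get_network(edge_label='timestamp') returns an igraph.Graph whose
# edges carry the accumulated timestamp lists (the same attribute that
# save_graph() above turns into integer weights):
#
#   g = processor.get_network()
#   save_graph(g, lang, type_, date_)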