Example #1
def main():
    logging.basicConfig(  # filename="graph_longitudinal_analysis.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    ## explode dump filename in order to obtain wiki lang, dump date and type
    _, date_, _ = mwlib.explode_dump_filename(args.file_name)

    fn, start, tw = args.file_name, args.start, args.time_window
    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)
    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw

    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen,"
                     "hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
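These examples come from the sonet wiki-analysis tools and share a few helpers whose definitions are not shown. `Timr` is used as a context manager that reports how long the wrapped block takes (Example #27 also drives it with explicit `start`/`stop` calls); a minimal sketch of the context-manager side, assuming it simply logs wall-clock time:

import logging
import time
from contextlib import contextmanager

@contextmanager
def timr(name):
    # Hypothetical stand-in for sonet's Timr: log the wall-clock time
    # spent inside the with-block.
    start = time.time()
    try:
        yield
    finally:
        logging.info("%s took %.2fs", name, time.time() - start)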
Example #2
    def setUp(self):
        xml = "tests/utpedits2graph/" + \
              "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
        self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)

        deflate, _lineno = find_open_for_this_file(xml)
        welcome = defaultdict(str)
        welcome.update({'it': r'Benvenut',
                        'en': r'Welcome'})
        if _lineno:
            src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
        else:
            src = deflate(xml)
        tag = mwlib.get_tags(src,
                             tags='page,title,revision,timestamp,contributor,'
                                  'username,ip,comment,id')
        translations = mwlib.get_translations(src)

        try:
            lang_user = unicode(translations['User'])
            lang_user_talk = unicode(translations['User talk'])
        except UnicodeDecodeError:
            lang_user = smart_str(translations['User'])
            lang_user_talk = smart_str(translations['User talk'])
        src.close()
        src = deflate(xml)
        self.processor = HistoryPageProcessor(tag=tag,
                                              user_talk_names=(lang_user_talk,
                                                               u"User talk"))
        self.processor.welcome_pattern = welcome[self.lang]
        self.processor.start(src)
        self.g = self.processor.get_network()
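The test above also documents the dump naming convention that `mwlib.explode_dump_filename` depends on: `vecwiki-20100307-stub-meta-history-TEST.xml.bz2` yields the language (`vec`), the dump date (`20100307`) and the dump type. A rough sketch of that split (hypothetical; the real helper lives in `sonet.mwlib` and may normalize the type component differently):

import os
import re

def explode_dump_filename_sketch(path):
    # "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
    #     -> ("vec", "20100307", "-stub-meta-history-TEST")
    name = os.path.basename(path)
    m = re.match(r'([a-z]+)wiki-(\d{8})(.*?)\.xml', name)
    return m.group(1), m.group(2), m.group(3)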
Example #3
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
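Every example opens the dump twice: once, optionally limited to the first 51 lines, to read tags, namespaces and translations from the header, and once more for the full parse. `find_open_for_this_file` returns the opener plus a flag telling whether it supports that line limit; a minimal sketch assuming extension-based dispatch (the real `sonet.lib` helper may prefer a faster external decompressor):

import bz2
from itertools import islice
from StringIO import StringIO

def bz2_open(path, numlines=None):
    # Mirrors the deflate(xml) / deflate(xml, 51) calls above: with
    # numlines, return just the head of the dump, which is enough to
    # read the xmlns and the <namespaces> block cheaply.
    f = bz2.BZ2File(path)
    if numlines is None:
        return f
    try:
        return StringIO(''.join(islice(f, numlines)))
    finally:
        f.close()

def find_open_for_this_file_sketch(path):
    # (opener, supports_line_limit)
    if path.endswith('.bz2'):
        return bz2_open, 1
    return open, 0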
Example #4
def main():

    logging.basicConfig(  # filename="graph_longitudinal_analysis.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')
    
    op = create_option_parser()
    args = op.parse_args()
    
    ## explode dump filename in order to obtain wiki lang, dump date and type
    lang, date_, type_ = mwlib.explode_dump_filename(args.file_name)
                    
    fn, start, tw = args.file_name, args.start, args.time_window
    ## if end argument is not specified, then use the dump date
    end = args.end if args.end else lib.yyyymmdd_to_datetime(date_)
    ## frequency not to be considered in case of cumulative analysis
    freq = args.frequency if (args.frequency and not args.cumulative) else tw
    
    if args.cumulative:
        logging.info("Cumulative longitudinal analysis chosen, hence not considering following option: frequency")

    with Timr("RUNNING ANALYSIS"):
        if args.cumulative:
            cumulative_analysis(fn, start, end, freq)
        else:
            time_slice_analysis(fn, start, end, freq, tw)
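`lib.yyyymmdd_to_datetime` converts the `20100307`-style date component into a `datetime` so it can serve as the default end of the analysis window; a one-line sketch (the extra argument seen in Example #7 is not reconstructed here):

from datetime import datetime

def yyyymmdd_to_datetime_sketch(s):
    # "20100307" -> datetime.datetime(2010, 3, 7, 0, 0)
    return datetime.strptime(s, "%Y%m%d")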
Example #5
    def setUp(self):
        xml = "tests/utpedits2graph/" + \
              "vecwiki-20100307-stub-meta-history-TEST.xml.bz2"
        self.lang, self.date_, self.type_ = mwlib.explode_dump_filename(xml)

        deflate, _lineno = find_open_for_this_file(xml)
        welcome = defaultdict(str)
        welcome.update({'it': r'Benvenut', 'en': r'Welcome'})
        if _lineno:
            src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
        else:
            src = deflate(xml)
        tag = mwlib.get_tags(src,
                             tags='page,title,revision,timestamp,contributor,'
                             'username,ip,comment,id')
        translations = mwlib.get_translations(src)

        try:
            lang_user = unicode(translations['User'])
            lang_user_talk = unicode(translations['User talk'])
        except UnicodeDecodeError:
            lang_user = smart_str(translations['User'])
            lang_user_talk = smart_str(translations['User talk'])
        src.close()
        src = deflate(xml)
        self.processor = HistoryPageProcessor(tag=tag,
                                              user_talk_names=(lang_user_talk,
                                                               u"User talk"))
        self.processor.welcome_pattern = welcome[self.lang]
        self.processor.start(src)
        self.g = self.processor.get_network()
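`mwlib.get_tags` resolves bare tag names into their namespace-qualified `{xmlns}tag` form so lxml's `iterparse` can match them against the dump. A sketch assuming the xmlns is scraped from the `<mediawiki ...>` root element (hypothetical; the real helper is in `sonet.mwlib`):

import re

def get_tags_sketch(src, tags='page,title,revision'):
    # Read the start of the dump, pull out the default xmlns, and
    # qualify each requested tag name for lxml matching.
    head = src.read(1024)
    xmlns = re.search(r'xmlns="([^"]+)"', head).group(1)
    return dict((t, '{%s}%s' % (xmlns, t)) for t in tags.split(','))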
Example #6
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e',
                 '--min-edits',
                 default=0,
                 dest="min_edits",
                 metavar="MIN_EDITS",
                 type=int,
                 help="pages with less than MIN_EIDTS edits "
                 "are skipped (default: %(default)s)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                   "contributor,title,username")
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag,
                                    lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
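A note on the help strings above: optparse substitutes the literal `%default` placeholder when rendering `--help` output (the `%(default)s` form belongs to argparse), so the `--min-edits` help text reads, for instance:

import optparse

p = optparse.OptionParser()
p.add_option('-e', '--min-edits', default=0, dest="min_edits", type=int,
             help="pages with less than MIN_EDITS edits "
                  "are skipped (default: %default)")
# "--help" now prints "... (default: 0)"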
Example #7
def main():
    from bz2 import BZ2File
    from csv import DictWriter

    logging.basicConfig(#filename="usercontributions_export.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    op = create_option_parser()
    args = op.parse_args()

    xml, out, threshold = args.dump, args.out, args.threshold

    lang, date_, _ = mwlib.explode_dump_filename(xml)
    deflate, _lineno = find_open_for_this_file(xml)

    date_ = yyyymmdd_to_datetime(date_, 1)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tmp = ["Normal"]+[v for _, (_, v) in enumerate(mwlib.get_namespaces(src))]
    namespaces = []
    # fix for quartiles
    for ns in tmp:
        for n in range(1, 5):
            namespaces.append("%s_%d" % (ns, n))
    print namespaces

    fout = BZ2File(out, 'w')

    fields = ['username', 'normal_edits', 'comments_count', 'comments_avg',
              'minor', 'revert', 'npov', 'welcome', 'please', 'thanks',
              'first_edit', 'last_edit', 'tot_edits', 'active_days',
              'days_since_first_edit', 'left_since', 'diversity_score',
              'first_edit_year', 'first_edit_month', 'first_edit_day',
              'last_edit_year', 'last_edit_month', 'last_edit_day', ]
    fields[2:2] = namespaces
    dw = DictWriter(fout, fields)
    dw.writeheader()

    ## to get only the first 1000 users:
    #from itertools import islice
    #data_iterator = islice(prepare_data(namespaces), 1000)
    data_iterator = prepare_data(namespaces, lang, date_, threshold)

    count = 0
    for user in data_iterator:
        for k, v in user.iteritems():
            if type(v) in [int, float]:
                assert v >= 0, "%s is negative" % (k,)
        dw.writerow(user)

        count += 1
        if not count % 5000:
            logging.info(count)
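The `fields[2:2] = namespaces` line above splices the per-namespace column names into the CSV header without overwriting anything: assigning to a zero-width slice is an in-place insert. For example:

fields = ['username', 'normal_edits', 'minor']
fields[2:2] = ['Normal_1', 'Normal_2']  # insert before index 2
# fields == ['username', 'normal_edits', 'Normal_1', 'Normal_2', 'minor']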
Example #8
def main():

    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)  # filename="graph_longitudinal_analysis.log"
    logging.info("---------------------START---------------------")

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    welcome = defaultdict(str)

    welcome.update({"it": r"Benvenut", "en": r"Welcome"})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src, tags="page,title,revision,timestamp,contributor,username,ip,comment")

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations["User"])
        lang_user_talk = unicode(translations["User talk"])
    except UnicodeDecodeError:
        lang_user = smart_str(translations["User"])
        lang_user_talk = smart_str(translations["User talk"])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag, user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr("Processing"):
        processor.start(src)  ## PROCESSING

    with Timr("Getting network"):
        g = processor.get_network()

    logging.info("Nodes: %d" % len(g.vs))
    logging.info("Edges: %d" % len(g.es))

    with Timr("Saving graph"):
        save_graph(g, lang, type_, date_)
Example #9
def main():
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [
            l[0].decode('latin-1') for l in csv.reader(f)
            if l and not l[0][0] == '#'
        ]

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
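The `desired_pages` comprehension keeps only the first CSV column, decodes it from latin-1 (Python 2 bytes-to-unicode decoding), and skips empty rows and `#` comment lines. The same logic as a small helper:

import csv

def read_desired_pages(path, encoding='latin-1'):
    # First column only, decoded to unicode; rows whose first field
    # starts with '#' are treated as comments.
    with open(path, 'rb') as f:
        return [row[0].decode(encoding) for row in csv.reader(f)
                if row and not row[0].startswith('#')]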
Example #10
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file gender_file output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-e', '--min-edits', default=0, dest="min_edits",
                 metavar="MIN_EDITS", type=int,
                 help="pages with less than MIN_EIDTS edits "
                      "are skipped (default: %(default)s)")

    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    gender_data = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags="page,redirect,timestamp,ip,"
                        "contributor,title,username")
    src.close()
    src = deflate(xml)

    out = open(output, "w")
    processor = GenderPageProcessor(tag=tag, lang=lang,
                                    output=out,
                                    userns=translation['User'],
                                    gender_data=gender_data,
                                    min_edits=opts.min_edits
                                   )
    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
Example #11
def main():
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(
        src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >> sys.stderr, "BEGIN PARSING"
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
                                     user_talk_names=(lang_user_talk,
                                                      u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >> sys.stderr, "Nodes:", len(g.vs)
    print >> sys.stderr, "Edges:", len(g.es)

    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Example #12
def main():
    import optparse
    import csv

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired(desired_pages)
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
Example #13
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list desired_words "
              "acceptance_ratio")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 4:
        p.error("Wrong parameters")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
Example #14
def main():
    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip,comment')

    translations = mwlib.get_translations(src)
    lang_user = unicode(translations['User'])
    lang_user_talk = unicode(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    print >>sys.stderr, "BEGIN PARSING"
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
        user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'
    with Timr('Processing'):
        processor.start(src) ## PROCESSING

    with Timr('EdgeCache.get_network()'):
        g = processor.get_network()

    print >>sys.stderr, "Nodes:", len(g.vs)
    print >>sys.stderr, "Edges:", len(g.es)

    for e in g.es:
        e['weight'] = len(e['timestamp'])
        #e['timestamp'] = str(e['timestamp'])
    with Timr('Pickling'):
        g.write("%swiki-%s%s.pickle" % (lang, date_, type_), format="pickle")
Example #15
def main():
    logging.basicConfig(  #filename="usercontributions.log",
        stream=sys.stderr,
        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip'+ \
             ',comment,id,minor')

    namespaces = [(0, "Normal")] + mwlib.get_namespaces(src)

    src.close()
    logging.info("BEGIN PARSING")
    src = deflate(xml)

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    p = Process(target=use_contrib_dict,
                args=(receiver, processor.namespaces, lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src)  ## PROCESSING

    sender.send(None)
    p.join()  ## wait until save is complete
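Examples #15 and #16 stream per-user records to a separate writer process through a `multiprocessing.Pipe`, using `None` as the end-of-stream sentinel (`sender.send(None)` followed by `p.join()`). The consumer (`use_contrib_dict`) is not shown; a minimal sketch of that side of the pattern:

from multiprocessing import Pipe, Process

def consumer(receiver):
    # Hypothetical stand-in for use_contrib_dict: drain the pipe until
    # the None sentinel arrives, then return so join() can complete.
    while True:
        item = receiver.recv()
        if item is None:
            break
        # ... accumulate or save the item here ...

if __name__ == '__main__':
    receiver, sender = Pipe(duplex=False)
    p = Process(target=consumer, args=(receiver,))
    p.start()
    for record in ({'user': 'a'}, {'user': 'b'}):
        sender.send(record)
    sender.send(None)  # sentinel: no more data
    p.join()           # wait until the consumer has finished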
Example #16
def main():
    logging.basicConfig(#filename="usercontributions.log",
                        stream=sys.stderr,
                        level=logging.DEBUG)
    logging.info('---------------------START---------------------')

    receiver, sender = Pipe(duplex=False)

    opts, args = opt_parse()
    xml = args[0]

    ## SET UP FOR PROCESSING
    lang, _, _ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
        tags='page,title,revision,timestamp,contributor,username,ip'+ \
             ',comment,id,minor')

    namespaces = [(0, "Normal")]+mwlib.get_namespaces(src)

    src.close()
    logging.info("BEGIN PARSING")
    src = deflate(xml)

    processor = UserContributionsPageProcessor(tag=tag, lang=lang)
    processor.sender = sender
    processor.namespaces = namespaces
    processor.time_end = opts.end
    ##TODO: only works on it.wikipedia.org! :-)
    processor.welcome_pattern = r'Benvenut'

    p = Process(target=use_contrib_dict, args=(receiver, processor.namespaces,
                                               lang))
    p.start()

    with Timr('PROCESSING'):
        processor.start(src) ## PROCESSING

    sender.send(None)
    p.join() ## wait until save is complete
Example #17
def main():
    import optparse
    import csv

    p = optparse.OptionParser(usage="usage: %prog [options] file desired_list acceptance_ratio")
    p.add_option("-v", action="store_true", dest="verbose", default=False, help="Verbose output (like timings)")
    p.add_option(
        "-e", "--encoding", action="store", dest="encoding", default="latin-1", help="encoding of the desired_list file"
    )
    opts, files = p.parse_args()
    if opts.verbose:
        import sys, logging

        logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,title,revision," + "minor,timestamp,redirect,ip,username")

    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation["Talk"]
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr("Retrieving bots"):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr("Parsing"):
        processor.start(src)
    processor.flush()
Example #18
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list desired_words "
              "acceptance_ratio")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    opts, files = p.parse_args()
    if opts.verbose:
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 4:
        p.error("Wrong parameters")

    xml, desired_pages_fn, desired_words_fn = files[0:3]
    threshold = float(files[3])

    desired_words = [w.lower() for w in get_lines_in_list(desired_words_fn)]

    lang, _, _ = explode_dump_filename(xml)

    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,'+ \
                  'minor,timestamp,redirect,text')

    src.close()
    src = deflate(xml)

    processor = HistoryWordsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.set_desired_from_csv(desired_pages_fn)
    processor.words = desired_words

    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
Example #19
def main():

    logging.basicConfig(#filename="random_page_extractor.log",
                                stream=sys.stderr,
                                level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    with open(args.desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(
        tag=tag, lang=lang, output=output,
        threshold=args.ratio,
        min_text=args.min_text_length,
        n_users=args.editors_number,
        start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired(desired_pages)
    with Timr('processing'):
        processor.start(src)
Example #20
def main():
    import optparse

    p = optparse.OptionParser(usage="usage: %prog [-s SOURCE] [-h] file")
    p.add_option('-s', '--source', metavar='SOURCE', dest='source',
                 help='Specify a graph to use as source for attributes '+ \
                 '(this will disable API calls)')

    opts, files = p.parse_args()

    if not files:
        p.error("Give me a file, please ;-)")
    fn = files[0]

    lang, date, type_ = explode_dump_filename(fn)

    groups = ('bot', 'sysop', 'bureaucrat', 'checkuser', 'steward', 'import',
              'transwiki', 'uploader', 'ipblock-exempt', 'oversight',
              'founder', 'rollbacker', 'accountcreator', 'autoreviewer',
              'abusefilter')
    g = ig.load(fn)
    if opts.source:
        sourceg = ig.load(opts.source)
        for destv in g.vs:
            try:
                sourcev = sourceg.vs.select(username=destv['username'])[0]
            except IndexError:
                print destv['username'], 'not found in source'
                for group in groups:
                    destv[group] = None
                continue
            for group in groups:
                destv[group] = sourcev[group]

    else:
        for group in groups:
            addGroupAttribute(g, lang, group)

        print 'BLOCKED ACCOUNTS'
        addBlockedAttribute(g, lang)

    print 'ANONYMOUS USERS'
    g.vs['anonymous'] = map(isip, g.vs['username'])
    g.write("%swiki-%s%s_rich.pickle" % (lang, date, type_), format="pickle")
Example #22
def main():

    logging.basicConfig(  # filename="random_page_extractor.log",
        stream=sys.stderr,
        level=logging.DEBUG)

    op = create_option_parser()
    args = op.parse_args()

    lang, date_, type_ = explode_dump_filename(args.xml_fn)
    deflate, _lineno = lib.find_open_for_this_file(args.xml_fn)

    dumps_checker(args, type_)

    logging.info('---------------------START---------------------')

    if _lineno:
        src = deflate(args.xml_fn, 51)
    else:
        src = deflate(args.xml_fn)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,redirect,text,username,ip,timestamp')

    src.close()
    src = deflate(args.xml_fn)

    output = open(args.output, 'w') if args.output else None

    processor = HistoryRevisionsPageProcessor(
                    tag=tag,
                    lang=lang,
                    output=output,
                    threshold=args.ratio,
                    min_text=args.min_text_length,
                    min_revisions=args.revisions_number,
                    n_users=args.editors_number,
                    start_revision=args.initial_revision)

    processor.talkns = translation['Talk']
    processor.desired_page_type = args.type
    processor.set_desired_from_csv(args.desired_pages_fn,
                                   encoding=args.encoding)
    with Timr('processing'):
        processor.start(src)
Example #23
def main():
    import optparse

    p = optparse.OptionParser(usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option("-v", action="store_true", dest="verbose", default=False, help="Verbose output (like timings)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(
            stream=sys.stderr,
            level=logging.DEBUG,
            format="%(asctime)s %(levelname)s %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags="page,redirect,timestamp,ip,revision,title")
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag, lang=lang, output=output,
                                       userns=translation["User"],
                                       geoip=geoip_db)
    with Timr("Processing"):
        processor.start(src)  ## PROCESSING
    processor.flush()
Example #24
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    opts, files = p.parse_args()
    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    with open(desired_pages_fn, 'rb') as f:
        desired_pages = [l[0].decode('latin-1') for l in csv.reader(f)
                         if l and not l[0][0] == '#']
    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    processor = HistoryRevisionsPageProcessor(tag=tag, lang=lang,
                                              output=output)
    processor.talkns = translation['Talk']
    processor.desired_page_type = opts.type
    processor.set_desired(desired_pages)
    processor.start(src)
    processor.flush()
Example #25
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle"
    )

    _, args = p.parse_args()

    if len(args) != 2:
        p.error("Too few or too many arguments")
    xml, rich_fn = args

    global lang_user_talk, lang_user, tag, user_classes
    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)
    g = sg_load(rich_fn)
    user_classes = dict(g.get_user_class('username',
                                  ('anonymous', 'bot', 'bureaucrat', 'sysop')))

    p = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    p.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()
    users_cache = {}
    # get a list of pair (class name, frequency distributions)
    for cls, fd in done_p_receiver.recv():
        with open("%swiki-%s-words-%s.dat" %
                  (lang, date,
                   cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in fd:
                print >> out, v, k
        del fd

    for cls, counters in done_p_receiver.recv():
        with open("%swiki-%s-smile-%s.dat" %
                  (lang, date,
                   cls.replace(' ', '_')), 'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in counters:
                print >> out, v, k
        del counters

    p.join()

    print >> sys.stderr, "end of FreqDist"
Example #26
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p',
                 '--per-page',
                 action="store",
                 dest="per_page_stats",
                 help="Per page stats output")
    p.add_option('-e',
                 '--min-edits',
                 action="store",
                 type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edit edits")
    p.add_option('-a',
                 '--min-anon',
                 action="store",
                 type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E',
                 '--exclude',
                 action="store",
                 dest="exclude_countries",
                 help="Countries to exclude, colon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)

    processor = CountriesPageProcessor(tag=tag,
                                       lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db)
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Example #27
def main():
    op = create_option_parser()

    (options, args) = op.parse_args()

    if len(args) != 1:
        print "Insert one (and only one) file to process\n"
        op.print_help()
        sys.exit(2)

    fn = args[0]
    lang, date, type_ = mwlib.explode_dump_filename(fn)

    g = sg.load(fn)
    g.time_slice_subgraph(start=options.start, end=options.end)
    g.invert_edge_attr('weight', 'length')

    vn = len(g.g.vs) # number of vertexes
    en = len(g.g.es) # number of edges

    timr = Timr()

    if options.as_table:
        tablr = Tablr()
        tablr.start(1024*32, lang)

    if options.group or options.users_role or options.histogram:
        for group_name, group_attr in groups.iteritems():
            g.defineClass(group_name, group_attr)
            print ' * %s : nodes number : %d' % (group_name,
                                                 len(g.classes[group_name]))
    else:
        g.defineClass('all', {})

    print " * filename: %s" % (fn,)
    print " * lang: %s" % (lang,)
    print " * date: %s" % (date,)

    if options.details:
        with Timr("details"):
            print " * nodes number: %d" % (vn,)
            print " * edges number: %d" % (en,)

            nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
            nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))
            self_loop_edges = len([edge for edge in g.g.es \
                                   if edge.target == edge.source])

            print " * nodes with out edges number: %d (%6f%%)" % (
                nodes_with_outdegree, 100.*nodes_with_outdegree/vn)
            print " * nodes with in edges number: %d (%6f%%)" % (
                nodes_with_indegree, 100.*nodes_with_indegree/vn)
            print " * max weights on edges : %s" % top(g.g.es['weight'])

            print " * self-loop edges: %d" % self_loop_edges
            #print " * diameter : %6f" % g.g.diameter(weights='length')
            #print " * average weight : %6f" % numpy.average(g.g.es['weight'])


    if options.density or options.reciprocity:
        with Timr('density&reciprocity'):
            for cls, vs in g.classes.iteritems():
                if not len(vs) > 1:
                    continue

                subgraph = vs.subgraph()

                print " * %s : density : %.10f" % (cls, subgraph.density())
                print " * %s : reciprocity : %.10f" % (cls,
                                                       subgraph.reciprocity())


    if options.degree:
        with Timr('degree'):
            g.g.vs['indegree'] = g.g.degree(type=ig.IN)
            g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                ind = numpy.array(vs['indegree'])
                outd = numpy.array(vs['outdegree'])

                print " * %s : mean IN degree (no weights): %f" % (
                    cls, numpy.average(ind))
                print " * %s : mean OUT degree (no weights): %f" % (
                    cls, numpy.average(outd))
                print " * %s : max IN degrees (no weights): %s" % (cls,
                                                                   top(ind))
                print " * %s : max OUT degrees (no weights): %s" % (cls,
                                                                    top(outd))

                print " * %s : stddev IN degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(ind)))
                print " * %s : stddev OUT degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(outd)))

    if options.transitivity:
        ##print " * transitivity: %f" % (nx.transitivity(g), )
        pass

    if options.summary:
        # don't use with --as-table
        print " * summary: %s" % (g.g.summary(), )

    if options.distance:
        with Timr('split clusters'):
            vc = g.g.clusters()
            size_clusters = vc.sizes()
            giant = vc.giant()

            print " * length of 5 max clusters: %s" % top(size_clusters)
            #print " * #node in 5 max clusters/#all nodes: %s" % top(
            #    [1.*cluster_len/vn for cluster_len in size_clusters])


    if options.distance:
        with Timr('distance'):
            gg = sg.Graph(giant)
            print " * average distance in the giant component: %f" % \
                  gg.averageDistance(weight='length')
            print " * average hops in the giant component: %f" % \
                  gg.averageDistance()

            #print "Average distance 2: %f" % giant.average_path_length(True,
            #                                                           False)


    if options.efficiency:
        with Timr('efficiency'):
            print " * efficiency: %f" % g.efficiency(weight='length')


    ##TODO: compute for centrality only if "all" or "degree"
    if (options.plot or options.histogram or options.power_law or
        options.centrality):
        with Timr('set weighted indegree'):
            g.set_weighted_degree()


    if options.centrality:
        timr.start('centrality')
        centralities = options.centrality.split(',')
        if 'all' in centralities:
            centralities = 'betweenness,pagerank,degree'.split(',')

        if set(centralities).difference(
            'betweenness,pagerank,degree'.split(',')):
            logging.error('Unknown centrality')
            sys.exit(0)

        if "betweenness" in centralities:
            print >> sys.stderr, "betweenness"
            g.g.vs['bw'] = g.g.betweenness(weights='length', directed = True)

        #g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

        if 'pagerank' in centralities:
            print >> sys.stderr, "pagerank"
            g.g.vs['pr'] = g.g.pagerank(weights='weight') # pagerank

        if 'degree' in centralities:
            print >> sys.stderr, "outdegree"
            g.set_weighted_degree(type=ig.OUT)
        #total_weights = sum(g.g.es['weight'])
        max_edges = vn*(vn-1)

        for cls, vs in g.classes.iteritems():
            if not vs:
                continue

            if "betweenness" in centralities:
                norm_betweenness = numpy.array(g.classes[cls]['bw'])/max_edges
                print " * %s : average betweenness : %.10f" % (
                    cls, numpy.average(norm_betweenness))
                print " * %s : stddev betweenness : %.10f" % (
                    cls, numpy.sqrt(numpy.var(norm_betweenness)))
                print " * %s : max betweenness: %s" % (
                    cls, top(numpy.array(g.classes[cls]['bw'])/max_edges))

            #print " * Average eigenvector centrality : %6f" % numpy.average(
            #    g.vs['ev'])
            if 'pagerank' in centralities:
                print " * %s : average pagerank : %.10f" % (
                    cls, numpy.average(g.classes[cls]['pr']))
                print " * %s : stddev pagerank : %.10f" % (
                    cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
                print " * %s : max pagerank: %s" % (
                    cls, top(g.classes[cls]['pr']))

            if 'degree' in centralities:
                wi = g.classes[cls]['weighted_indegree']
                print " * %s : average IN degree centrality (weighted): %.10f" % (
                    cls, numpy.average(wi))
                print " * %s : stddev IN degree centrality (weighted): %.10f" % (
                    cls, numpy.sqrt(numpy.var(wi)))
                print " * %s : max IN degrees centrality (weighted): %s" % (
                    cls, top(wi))
                del wi

                wo = g.classes[cls]['weighted_outdegree']
                print " * %s : average OUT degree centrality (weighted) : %.10f" %\
                      (cls, numpy.average(wo))
                print " * %s : stddev OUT degree centrality (weighted) : %.10f" % \
                      (cls, numpy.sqrt(numpy.var(wo)))
                print " * %s : max OUT degrees centrality (weighted): %s" % (
                    cls, top(wo))
                del wo

        timr.stop('centrality')

    if options.power_law:
        with Timr('power law'):
            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                indegrees = vs['weighted_indegree']

                try:
                    alpha_exp = ig.statistics.power_law_fit(indegrees, xmin=6)
                    print " * %s : alpha exp IN degree distribution : %10f " %\
                          (cls, alpha_exp)
                except ValueError:
                    print >> sys.stderr,\
                          " * %s : alpha exp IN degree distribution : ERROR" %\
                          (cls,)

    if options.histogram:
        list_with_index = lambda degrees, idx: [(degree, idx) for degree
                                                in degrees if degree]
        all_list = []

        nogrp_indegrees = g.g.vs.select(sysop_ne=True, bureaucrat_ne=True,
                                        steward_ne=True, founder_ne=True,
                                        bot_ne=True)['weighted_indegree']
        all_list += list_with_index(nogrp_indegrees, 1)

        sysops_indegrees = g.classes['sysop']['weighted_indegree']
        all_list += list_with_index(sysops_indegrees, 2)

        burs_indegrees = g.classes['bureaucrat']['weighted_indegree']
        all_list += list_with_index(burs_indegrees, 3)

        stewards_indegrees = g.classes['steward']['weighted_indegree']
        all_list += list_with_index(stewards_indegrees, 4)

        founders_indegrees = g.classes['founder']['weighted_indegree']
        all_list += list_with_index(founders_indegrees, 5)

        bots_indegrees = g.classes['bot']['weighted_indegree']
        all_list += list_with_index(bots_indegrees, 6)

        if options.gnuplot:
            f = open('hist.dat', 'w')
        else:
            f = open('%swiki-%s-hist.dat' % (lang, date), 'w')

        all_list.sort(reverse=True)

        for indegree, grp in all_list:
            for _ in range(grp - 1):
                print >> f, 0,
            print >> f, indegree,
            for _ in range(grp, 6):
                print >> f, 0,
            print >> f, ""
        f.close()

    if options.gnuplot:
        from popen2 import Popen3

        process = Popen3('gnuplot hist.gnuplot')
        process.wait()

        os.rename('hist.png', '%swiki-%s-hist.png' % (lang, date))
        os.rename('hist.dat', '%swiki-%s-hist.dat' % (lang, date))

    if options.plot:
        ## TODO: evaluate if this can be done with
        ## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
        with Timr('plot'):
            import math

            ## filter:
            #print len(g.g.vs), len(g.g.es)
            #g.set_weighted_degree(type=ig.OUT)
            #g.g = g.g.subgraph(g.g.vs.select(weighted_indegree_ge=10,
            #                           weighted_outdegree_ge=1))
            #g.g.write_graphml('itwiki-20100729-stub-meta-history_in10_out1.graphml')
            #print len(g.g.vs), len(g.g.es)

            bots = g.g.vs.select(bot=True)
            bots['color'] = ('purple',)*len(bots)
            logging.debug('bots: ok')

            anonyms = g.g.vs.select(anonymous=True)
            anonyms['color'] = ('blue',)*len(anonyms)

            sysops = g.g.vs.select(sysop=True)
            sysops['color'] = ('yellow',)*len(sysops)

            bur_sysops = g.g.vs.select(bureaucrat=True, sysop=True)
            bur_sysops['color'] = ('orange',)*len(bur_sysops)

            g.g.vs['size'] = [math.sqrt(v['weighted_indegree']+1)*10 for v
                              in g.g.vs]

            logging.debug('plot: begin')
            ig.plot(g.g, target=lang+"_general.png", bbox=(0, 0, 8000, 8000),
                    edge_color='grey', layout='drl')
            logging.debug('plot: end')
            weights = g.g.es['weight']
            max_weight = max(weights)

            g.g.es['color'] = [(255.*e['weight']/max_weight, 0., 0.) for e
                               in g.g.es]
            g.g.es['width'] = weights

            ig.plot(g.g, target=lang+"_weighted_edges.png", bbox=(0, 0, 4000,
                                                                  2400),
                    layout='fr', vertex_label=' ')


    if options.as_table:
        tablr.stop()

        #tablr.printHeader()
        #tablr.printData()
        tablr.saveInDjangoModel()


    if options.adjacency:
        giant = g.g.clusters().giant()
        #destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
        destAdj = "%swiki-%s-adj.csv" % (lang, date)
        #destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
        destRec = "%swiki-%s-rec.csv" % (lang, date)
        sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
        sg.Graph(giant).writeReciprocityMatrix('username', destRec)


    if options.users_role:
        l = g.get_user_class('username', ('anonymous', 'bot', 'bureaucrat',
                                        'sysop'))

        #destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
        destUR = "%swiki-%s-ur.csv" % (lang, date)
        with open(destUR, 'w') as f:
            for username, role in sorted(l):
                print >> f, "%s,%s" % (username, role)

        from random import shuffle
        #destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
        destCls = "%swiki-%s-%%s.csv" % (lang, date)
        for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
            users = g.classes[cls]['username']
            shuffle(users)
            with open(destCls % cls, 'w') as f:
                for username in users:
                    print >> f, \
                          ("%s,http://vec.wikipedia.org/w/index.php?title="+\
                          "Discussion_utente:%s&action=history&offset="+\
                          "20100000000001") % (username, username)
Example #28
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
            usage="usage: %prog [options] input_file dictionary output_file",
            option_class=SonetOption
        )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-S', '--detailed-start', action="store",
        dest='detailed_start', type="yyyymmdd", metavar="YYYYMMDD",
        default=None, help="Detailed output start date")
    p.add_option('-E', '--detailed-end', action="store",
        dest='detailed_end', type="yyyymmdd", metavar="YYYYMMDD", default=None,
        help="Detailed output end date")
    p.add_option('-n', '--detailed-namespace', action="store",
                 dest="detailed_ns", default="Normal",
                 help="Namespace of desired detailed data (default: Normal)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags=('page,title,revision,timestamp,text,redirect,'
                              'contributor,username,ip'))
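    # namespace names declared in the dump header, with (0, "Normal")
    # prepended for the main article namespace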
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    if os.path.exists(output):
        logging.error("File %s already exists!", output)
        sys.exit(1)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean
    if opts.detailed_start and opts.detailed_end:
        print """
        You are going to run the script with detailed output on %d days.
        This is going to produce some CSV files on your disk, one for each
        day. Is this what you really want to do? [press enter to continue]
        """ % (opts.detailed_end - opts.detailed_start).days
        raw_input()
        processor.pywc.detailed = True
        processor.detailed_start = opts.detailed_start
        processor.detailed_end = opts.detailed_end
        processor.detailed_ns = opts.detailed_ns

    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
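# A plausible invocation of the script above (the script name and file names
# are placeholders, not taken from the source):
#   python pywc.py -v -c -S 20100101 -E 20100131 \
#       somewiki-20100218-pages-meta-history.xml.bz2 dictionary.csv out.csv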
Exemplo n.º 29
0
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file desired_list output_file")
    p.add_option('-t',
                 '--type',
                 action="store",
                 dest="type",
                 default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e',
                 '--encoding',
                 action="store",
                 dest="encoding",
                 default="latin-1",
                 help="encoding of the desired_list file")
    p.add_option('-v',
                 action="store_true",
                 dest="verbose",
                 default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T',
                 "--timeout",
                 action="store",
                 dest="timeout",
                 type=float,
                 default=0.5,
                 help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c',
                 '--clean',
                 action="store_true",
                 dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    desired_pages_fn = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = HistoryRevisionsPageProcessor(tag=tag,
                                              lang=lang,
                                              output=out,
                                              userns=translation['User'])
    processor.talkns = translation['Talk']
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.set_desired_from_csv(desired_pages_fn, encoding=opts.encoding)
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
    out.close()
Exemplo n.º 30
0
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file dictionary output_file")
    p.add_option('-t', '--type', action="store", dest="type", default="all",
                 help="Type of page to analize (content|talk|all)")
    p.add_option('-e', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-T', "--timeout", action="store", dest="timeout", type=float,
                 default=0.5, help="Diff timeout (default=0.5, 0=no timeout)")
    p.add_option('-c', '--clean', action="store_true", dest="clean",
                 default=False,
                 help="Cleans HTML, wiki syntax, acronyms and emoticons")
    p.add_option('-C', '--charlimit', action="store", dest="charlimit",
                 type="int", default=100000,
                 help="Maximim characters per line (default=100000)")
    p.add_option('-r', action="store_true", dest="regex", default=False,
                 help="Use a dictionary composed by regex (default=false)")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    dic = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,timestamp,text,redirect')
    namespaces = [x[1] for x in [(0, "Normal")] + mwlib.get_namespaces(src)]
    src.close()
    src = deflate(xml)

    out = open(output, 'w')
    processor = PyWCProcessor(tag=tag, lang=lang, dic=dic,
                              output=out, userns=translation['User'])
    processor.namespaces = namespaces
    if opts.type == 'talk':
        processor.get_articles = False
    elif opts.type == 'content':
        processor.get_talks = False
    processor.diff_timeout = opts.timeout
    processor.clean = opts.clean
    processor.pywc.clean_wiki = processor.pywc.clean_html = opts.clean

    with Timr('Processing'):
        processor.start(src) ## PROCESSING
    processor.flush()
    out.close()
Exemplo n.º 31
0
def main():
    import optparse

    p = optparse.OptionParser(
        usage="usage: %prog [options] dump enriched_pickle")

    _, args = p.parse_args()

    if len(args) != 2:
        p.error("Too few or too many arguments")
    xml, rich_fn = args

    global lang_user_talk, lang_user, tag, user_classes
    ## pipe to send data to the subprocess
    p_receiver, p_sender = Pipe(duplex=False)
    ## pipe to get elaborated data from the subprocess
    done_p_receiver, done_p_sender = Pipe(duplex=False)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)
    lang, date, _ = mwlib.explode_dump_filename(xml)
    g = sg_load(rich_fn)
    user_classes = dict(
        g.get_user_class('username',
                         ('anonymous', 'bot', 'bureaucrat', 'sysop')))
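    # worker process: consumes page data from the pipe and builds per-class
    # frequency distributions; sending 0 on the pipe stops it (see below)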

    p = Process(target=get_freq_dist, args=(p_receiver, done_p_sender))
    p.start()

    translations = mwlib.get_translations(src)
    lang_user, lang_user_talk = translations['User'], translations['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    ## open with a faster decompressor (but that probably cannot seek)
    src.close()
    src = lib.BZ2FileExt(xml, parallel=False)

    partial_process_page = partial(process_page, send=p_sender)
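    # stream <page> elements as they are parsed and hand each one to
    # process_page, which pushes the extracted text down the pipe to the worker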
    mwlib.fast_iter(etree.iterparse(src, tag=tag['page']),
                    partial_process_page)
    logging.info('Users missing in the rich file: %d', count_missing)

    p_sender.send(0)  # this STOPS the process

    print >> sys.stderr, "end of parsing"

    ## SAVE DATA
    g.set_weighted_degree()
    users_cache = {}
    # get a list of pair (class name, frequency distributions)
    for cls, fd in done_p_receiver.recv():
        with open(
                "%swiki-%s-words-%s.dat" % (lang, date, cls.replace(' ', '_')),
                'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in fd:
                print >> out, v, k
        del fd

    for cls, counters in done_p_receiver.recv():
        with open(
                "%swiki-%s-smile-%s.dat" % (lang, date, cls.replace(' ', '_')),
                'w') as out:
            # users in this group
            try:
                users = users_cache[cls]
            except KeyError:
                users = get_class(g, cls)
                users_cache[cls] = users
            print >> out, '#users: ', len(users)
            print >> out, '#msgs: ', sum(users['weighted_indegree'])
            for k, v in counters:
                print >> out, v, k
        del counters

    p.join()

    print >> sys.stderr, "end of FreqDist"
Exemplo n.º 32
0
def main():
    import optparse
    from sonet.lib import SonetOption

    p = optparse.OptionParser(
        usage="usage: %prog [options] file desired_list acceptance_ratio",
        option_class=SonetOption
    )
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-E', '--encoding', action="store", dest="encoding",
                 default="latin-1", help="encoding of the desired_list file")
    p.add_option('-d', '--delimiter', action="store", dest="delimiter",
                 default=",", help="CSV delimiter")
    p.add_option('-s', '--start', action="store", dest='start',
                 type="yyyymmdd", metavar="YYYYMMDD", default=None,
                 help="Look for revisions starting from this date")
    p.add_option('-e', '--end', action="store", dest='end', type="yyyymmdd",
                 metavar="YYYYMMDD", default=None,
                 help="Look for revisions until this date")

    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG)

    if len(files) != 3:
        p.error("Wrong parameters")

    xml = files[0]
    desired_pages_fn = files[1]
    threshold = float(files[2])

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src, tags='page,title,revision,' + \
                   'minor,timestamp,redirect,ip,username')

    src.close()
    src = deflate(xml)

    processor = HistoryEventsPageProcessor(tag=tag, lang=lang)
    processor.talkns = translation['Talk']
    processor.threshold = threshold
    processor.start_date = opts.start
    processor.end_date = opts.end
    processor.set_desired_from_csv(desired_pages_fn,
                                   encoding=opts.encoding,
                                   delimiter=opts.delimiter)
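    # collect the list of bot accounts so that bot edits can be told apart
    # while parsing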
    with Timr('Retrieving bots'):
        processor.set_bots()
    print "BEGIN PARSING"
    with Timr('Parsing'):
        processor.start(src)
    processor.flush()
Exemplo n.º 33
0
def main():
    import optparse
    p = optparse.OptionParser(
        usage="usage: %prog [options] input_file geoip_db output_file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-p', '--per-page', action="store",
                 dest="per_page_stats", help="Per page stats output")
    p.add_option('-e', '--min-edits', action="store", type=int,
                 dest="min_edits",
                 help="Skip if page has less than min-edit edits")
    p.add_option('-a', '--min-anon', action="store", type=int,
                 dest="min_anon",
                 help="Skip if page has less than min-anon anonymous edits")
    p.add_option('-E', '--exclude', action="store",
                 dest="exclude_countries",
                 help="Countries to exclude, colon (;) separated")
    opts, files = p.parse_args()

    if len(files) != 3:
        p.error("Wrong parameters")
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    xml = files[0]
    geoip_db = files[1]
    output = files[2]

    dumps_checker(xml)

    lang, _, _ = explode_dump_filename(xml)
    deflate, _lineno = lib.find_open_for_this_file(xml)

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    translation = get_translations(src)
    tag = get_tags(src,
                   tags='page,redirect,timestamp,ip,revision,title')
    src.close()
    src = deflate(xml)

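    # the processor geolocates anonymous editors through their IP addresses,
    # using the GeoIP database given on the command line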
    processor = CountriesPageProcessor(tag=tag, lang=lang,
                                       output=output,
                                       userns=translation['User'],
                                       geoip=geoip_db
                                      )
    if opts.per_page_stats:
        processor.per_page_stats = opts.per_page_stats
    if opts.exclude_countries:
        processor.exclude_countries = opts.exclude_countries.split(";")
    processor.min_edits = opts.min_edits
    processor.min_anon = opts.min_anon
    with Timr('Processing'):
        processor.start(src)  # PROCESSING
    processor.flush()
Exemplo n.º 34
0
def main():
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    welcome = defaultdict(str)

    welcome.update({'it': r'Benvenut', 'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)  # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                         'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
                                     user_talk_names=(lang_user_talk,
                                                      u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src)  ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
Exemplo n.º 35
0
def main():
    import optparse

    p = optparse.OptionParser(usage="usage: %prog file")
    p.add_option('-v', action="store_true", dest="verbose", default=False,
                 help="Verbose output (like timings)")
    p.add_option('-s', action="store", dest="signature", default=None,
                 help="Signature in this language (e.g. sig, firma..)")
    opts, files = p.parse_args()
    if opts.verbose:
        import sys
        import logging
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')

    try:
        xml = files[0]
    except IndexError:
        p.error("Give me one file, please")

    en_user, en_user_talk = u"User", u"User talk"

    lang, date, type_ = mwlib.explode_dump_filename(xml)

    src = BZ2File(xml)

    tag = mwlib.get_tags(src)

    ns_translation = mwlib.get_translations(src)
    lang_user, lang_user_talk = ns_translation['User'], \
             ns_translation['User talk']

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    lang_user = unicode(lang_user, "utf8")
    en_user = unicode(en_user)

    # open dump with an external process to use multiple cores
    _fast = True
    if _fast:
        src.close()
        src = lib.BZ2FileExt(xml)

    if opts.signature is not None:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                              user_talk_names=(lang_user_talk, en_user_talk),
                              search=(lang_user, en_user), lang=lang,
                              signature=opts.signature)
    else:
        processor = CurrentPageProcessor(ecache=EdgeCache(), tag=tag,
                              user_talk_names=(lang_user_talk, en_user_talk),
                              search=(lang_user, en_user), lang=lang)

    with Timr('Processing'):
        processor.start(src)

    with Timr('Create network'):
        g = processor.ecache.get_network()

    logging.info("Len:", len(g.vs))
    logging.info("Edges:", len(g.es))

    g.write("%swiki-%s%s.pickle" % (lang, date, type_), format="pickle")
Exemplo n.º 37
0
def main():
    opts, args = opt_parse()
    xml = args[0]
    if opts.verbose:
        logging.basicConfig(stream=sys.stderr,
                            level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s',
                            datefmt='%Y-%m-%d %H:%M:%S')
    logging.info('---------------------START---------------------')

    ## SET UP FOR PROCESSING
    lang, date_, type_ = mwlib.explode_dump_filename(xml)

    deflate, _lineno = find_open_for_this_file(xml)

    welcome = defaultdict(str)

    welcome.update({'it': r'Benvenut',
                    'en': r'Welcome'})

    if _lineno:
        src = deflate(xml, 51)   # Read first 51 lines to extract namespaces
    else:
        src = deflate(xml)

    tag = mwlib.get_tags(src,
                         tags='page,title,revision,timestamp,contributor,'
                              'username,ip,comment,id')

    translations = mwlib.get_translations(src)

    try:
        lang_user = unicode(translations['User'])
        lang_user_talk = unicode(translations['User talk'])
    except UnicodeDecodeError:
        lang_user = smart_str(translations['User'])
        lang_user_talk = smart_str(translations['User talk'])

    assert lang_user, "User namespace not found"
    assert lang_user_talk, "User Talk namespace not found"

    src.close()
    src = deflate(xml)

    processor = HistoryPageProcessor(tag=tag,
        user_talk_names=(lang_user_talk, u"User talk"))
    processor.time_start = opts.start
    processor.time_end = opts.end
    processor.welcome_pattern = welcome[lang]

    with Timr('Processing'):
        processor.start(src) ## PROCESSING

    with Timr('Getting network'):
        g = processor.get_network()

    logging.info("Nodes: %d", len(g.vs))
    logging.info("Edges: %d", len(g.es))

    with Timr('Saving graph'):
        save_graph(g, lang, type_, date_)
Exemplo n.º 38
0
def main():
    op = create_option_parser()

    (options, args) = op.parse_args()

    if len(args) != 1:
        print "Insert one (and only one) file to process\n"
        op.print_help()
        sys.exit(2)

    fn = args[0]
    lang, date, type_ = mwlib.explode_dump_filename(fn)

    g = sg.load(fn)
    g.time_slice_subgraph(start=options.start, end=options.end)
    g.invert_edge_attr('weight', 'length')

    vn = len(g.g.vs)  # number of vertexes
    en = len(g.g.es)  # number of edges

    timr = Timr()

    if options.as_table:
        tablr = Tablr()
        tablr.start(1024 * 32, lang)

    if options.group or options.users_role or options.histogram:
        for group_name, group_attr in groups.iteritems():
            g.defineClass(group_name, group_attr)
            print ' * %s : nodes number : %d' % (group_name,
                                                 len(g.classes[group_name]))
    else:
        g.defineClass('all', {})

    print " * filename: %s" % (fn, )
    print " * lang: %s" % (lang, )
    print " * date: %s" % (date, )

    if options.details:
        with Timr("details"):
            print " * nodes number: %d" % (vn, )
            print " * edges number: %d" % (en, )

            nodes_with_outdegree = len(g.g.vs.select(_outdegree_ge=1))
            nodes_with_indegree = len(g.g.vs.select(_indegree_ge=1))
            self_loop_edges = len([edge for edge in g.g.es \
                                   if edge.target == edge.source])

            print " * nodes with out edges number: %d (%6f%%)" % (
                nodes_with_outdegree, 100. * nodes_with_outdegree / vn)
            print " * nodes with in edges number: %d (%6f%%)" % (
                nodes_with_indegree, 100. * nodes_with_indegree / vn)
            print " * max weights on edges : %s" % top(g.g.es['weight'])

            print " * self-loop edges: %d" % self_loop_edges
            #print " * diameter : %6f" % g.g.diameter(weights='length')
            #print " * average weight : %6f" % numpy.average(g.g.es['weight'])

    if options.density or options.reciprocity:
        with Timr('density&reciprocity'):
            for cls, vs in g.classes.iteritems():
                if not len(vs) > 1:
                    continue

                subgraph = vs.subgraph()

                print " * %s : density : %.10f" % (cls, subgraph.density())
                print " * %s : reciprocity : %.10f" % (cls,
                                                       subgraph.reciprocity())

    if options.degree:
        with Timr('degree'):
            g.g.vs['indegree'] = g.g.degree(type=ig.IN)
            g.g.vs['outdegree'] = g.g.degree(type=ig.OUT)

            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                ind = numpy.array(vs['indegree'])
                outd = numpy.array(vs['outdegree'])

                print " * %s : mean IN degree (no weights): %f" % (
                    cls, numpy.average(ind))
                print " * %s : mean OUT degree (no weights): %f" % (
                    cls, numpy.average(outd))
                print " * %s : max IN degrees (no weights): %s" % (cls,
                                                                   top(ind))
                print " * %s : max OUT degrees (no weights): %s" % (cls,
                                                                    top(outd))

                print " * %s : stddev IN degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(ind)))
                print " * %s : stddev OUT degree (no weights): %f" % (
                    cls, numpy.sqrt(numpy.var(outd)))

    if options.transitivity:
        ##print " * transitivity: %f" % (nx.transitivity(g), )
        pass

    if options.summary:
        # don't use with --as-table
        print " * summary: %s" % (g.g.summary(), )

    if options.distance:
        with Timr('split clusters'):
            vc = g.g.clusters()
            size_clusters = vc.sizes()
            giant = vc.giant()

            print " * length of 5 max clusters: %s" % top(size_clusters)
            #print " * #node in 5 max clusters/#all nodes: %s" % top(
            #    [1.*cluster_len/vn for cluster_len in size_clusters])

    if options.distance:
        with Timr('distance'):
            gg = sg.Graph(giant)
            print " * average distance in the giant component: %f" % \
                  gg.averageDistance(weight='length')
            print " * average hops in the giant component: %f" % \
                  gg.averageDistance()

            #print "Average distance 2: %f" % giant.average_path_length(True,
            #                                                           False)

    if options.efficiency:
        with Timr('efficiency'):
            print " * efficiency: %f" % g.efficiency(weight='length')

    ##TODO: compute for centrality only if "all" or "degree"
    if (options.plot or options.histogram or options.power_law
            or options.centrality):
        with Timr('set weighted indegree'):
            g.set_weighted_degree()

    if options.centrality:
        timr.start('centrality')
        centralities = options.centrality.split(',')
        if 'all' in centralities:
            centralities = 'betweenness,pagerank,degree'.split(',')

        if set(centralities).difference(
                'betweenness,pagerank,degree'.split(',')):
            logging.error('Unknown centrality')
            sys.exit(1)

        if "betweenness" in centralities:
            print >> sys.stderr, "betweenness"
            g.g.vs['bw'] = g.g.betweenness(weights='length', directed=True)

        #g.g.vs['ev'] = g.g.evcent(weights='weight') # eigenvector centrality

        if 'pagerank' in centralities:
            print >> sys.stderr, "pagerank"
            g.g.vs['pr'] = g.g.pagerank(weights='weight')  # pagerank

        if 'degree' in centralities:
            print >> sys.stderr, "outdegree"
            g.set_weighted_degree(type=ig.OUT)
        #total_weights = sum(g.g.es['weight'])
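        # number of ordered vertex pairs, used below to normalize the
        # betweenness scores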
        max_edges = vn * (vn - 1)

        for cls, vs in g.classes.iteritems():
            if not vs:
                continue

            if "betweenness" in centralities:
                norm_betweenness = numpy.array(g.classes[cls]['bw']) \
                                   / max_edges
                print " * %s : average betweenness : %.10f" % (
                    cls, numpy.average(norm_betweenness))
                print " * %s : stddev betweenness : %.10f" % (
                    cls, numpy.sqrt(numpy.var(norm_betweenness)))
                print " * %s : max betweenness: %s" % (
                    cls, top(numpy.array(g.classes[cls]['bw']) / max_edges))

            #print " * Average eigenvector centrality : %6f" % numpy.average(
            #    g.vs['ev'])
            if 'pagerank' in centralities:
                print " * %s : average pagerank : %.10f" % (
                    cls, numpy.average(g.classes[cls]['pr']))
                print " * %s : stddev pagerank : %.10f" % (
                    cls, numpy.sqrt(numpy.var(g.classes[cls]['pr'])))
                print " * %s : max pagerank: %s" % (cls,
                                                    top(g.classes[cls]['pr']))

            if 'degree' in centralities:
                wi = g.classes[cls]['weighted_indegree']
                print " * %s : average IN degree centrality (weighted): %.10f" % (
                    cls, numpy.average(wi))
                print " * %s : stddev IN degree centrality (weighted): %.10f" % (
                    cls, numpy.sqrt(numpy.var(wi)))
                print " * %s : max IN degrees centrality (weighted): %s" % (
                    cls, top(wi))
                del wi

                wo = g.classes[cls]['weighted_outdegree']
                print " * %s : average OUT degree centrality (weighted) : %.10f" %\
                      (cls, numpy.average(wo))
                print " * %s : stddev OUT degree centrality (weighted) : %.10f" % \
                      (cls, numpy.sqrt(numpy.var(wo)))
                print " * %s : max OUT degrees centrality (weighted): %s" % (
                    cls, top(wo))
                del wo

        timr.stop('centrality')

    if options.power_law:
        with Timr('power law'):
            for cls, vs in g.classes.iteritems():
                if not vs:
                    continue

                indegrees = vs['weighted_indegree']

                try:
                    alpha_exp = ig.statistics.power_law_fit(indegrees, xmin=6)
                    print " * %s : alpha exp IN degree distribution : %10f " %\
                          (cls, alpha_exp)
                except ValueError:
                    print >> sys.stderr,\
                          " * %s : alpha exp IN degree distribution : ERROR" %\
                          (cls,)

    if options.histogram:
        list_with_index = lambda degrees, idx: [(degree, idx)
                                                for degree in degrees
                                                if degree]
        all_list = []

        nogrp_indegrees = g.g.vs.select(sysop_ne=True,
                                        bureaucrat_ne=True,
                                        steward_ne=True,
                                        founder_ne=True,
                                        bot_ne=True)['weighted_indegree']
        all_list += list_with_index(nogrp_indegrees, 1)

        sysops_indegrees = g.classes['sysop']['weighted_indegree']
        all_list += list_with_index(sysops_indegrees, 2)

        burs_indegrees = g.classes['bureaucrat']['weighted_indegree']
        all_list += list_with_index(burs_indegrees, 3)

        stewards_indegrees = g.classes['steward']['weighted_indegree']
        all_list += list_with_index(stewards_indegrees, 4)

        founders_indegrees = g.classes['founder']['weighted_indegree']
        all_list += list_with_index(founders_indegrees, 5)

        bots_indegrees = g.classes['bot']['weighted_indegree']
        all_list += list_with_index(bots_indegrees, 6)

        if options.gnuplot:
            f = open('hist.dat', 'w')
        else:
            f = open('%swiki-%s-hist.dat' % (lang, date), 'w')

        all_list.sort(reverse=True)
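        # one row per vertex: its weighted in-degree in the column of its
        # group (1-6) and zeros elsewhere, ready for gnuplot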

        for indegree, grp in all_list:
            for _ in range(grp - 1):
                print >> f, 0,
            print >> f, indegree,
            for _ in range(grp, 6):
                print >> f, 0,
            print >> f, ""
        f.close()

    if options.gnuplot:
        from popen2 import Popen3

        process = Popen3('gnuplot hist.gnuplot')
        process.wait()

        os.rename('hist.png', '%swiki-%s-hist.png' % (lang, date))
        os.rename('hist.dat', '%swiki-%s-hist.dat' % (lang, date))

    if options.plot:
        ## TODO: evaluate if this can be done with
        ## http://bazaar.launchpad.net/~igraph/igraph/0.6-main/revision/2018
        with Timr('plot'):
            import math

            ## filter:
            #print len(g.g.vs), len(g.g.es)
            #g.set_weighted_degree(type=ig.OUT)
            #g.g = g.g.subgraph(g.g.vs.select(weighted_indegree_ge=10,
            #                           weighted_outdegree_ge=1))
            #g.g.write_graphml('itwiki-20100729-stub-meta-history_in10_out1.graphml')
            #print len(g.g.vs), len(g.g.es)

            bots = g.g.vs.select(bot=True)
            bots['color'] = ('purple', ) * len(bots)
            logging.debug('bots: ok')

            anonyms = g.g.vs.select(anonymous=True)
            anonyms['color'] = ('blue', ) * len(anonyms)

            sysops = g.g.vs.select(sysop=True)
            sysops['color'] = ('yellow', ) * len(sysops)

            bur_sysops = g.g.vs.select(bureaucrat=True, sysop=True)
            bur_sysops['color'] = ('orange', ) * len(bur_sysops)

            g.g.vs['size'] = [
                math.sqrt(v['weighted_indegree'] + 1) * 10 for v in g.g.vs
            ]

            logging.debug('plot: begin')
            ig.plot(g.g,
                    target=lang + "_general.png",
                    bbox=(0, 0, 8000, 8000),
                    edge_color='grey',
                    layout='drl')
            logging.debug('plot: end')
            weights = g.g.es['weight']
            max_weight = max(weights)

            g.g.es['color'] = [(255. * e['weight'] / max_weight, 0., 0.)
                               for e in g.g.es]
            g.g.es['width'] = weights

            ig.plot(g.g,
                    target=lang + "_weighted_edges.png",
                    bbox=(0, 0, 4000, 2400),
                    layout='fr',
                    vertex_label=' ')

    if options.as_table:
        tablr.stop()

        #tablr.printHeader()
        #tablr.printData()
        tablr.saveInDjangoModel()

    if options.adjacency:
        giant = g.g.clusters().giant()
        #destAdj = "%s/%swiki-%s-adj.csv" % (os.path.split(fn)[0], lang, date)
        destAdj = "%swiki-%s-adj.csv" % (lang, date)
        #destRec = "%s/%swiki-%s-rec.csv" % (os.path.split(fn)[0], lang, date)
        destRec = "%swiki-%s-rec.csv" % (lang, date)
        sg.Graph(giant).writeAdjacencyMatrix(destAdj, 'username')
        sg.Graph(giant).writeReciprocityMatrix('username', destRec)

    if options.users_role:
        l = g.get_user_class('username',
                             ('anonymous', 'bot', 'bureaucrat', 'sysop'))

        #destUR = "%s/%swiki-%s-ur.csv" % (os.path.split(fn)[0], lang, date)
        destUR = "%swiki-%s-ur.csv" % (lang, date)
        with open(destUR, 'w') as f:
            for username, role in sorted(l):
                print >> f, "%s,%s" % (username, role)

        from random import shuffle
        #destCls = "%s/%swiki-%s-%%s.csv" % (os.path.split(fn)[0], lang, date)
        destCls = "%swiki-%s-%%s.csv" % (lang, date)
        for cls in ('anonymous', 'bot', 'bureaucrat', 'sysop', 'normal_user'):
            users = g.classes[cls]['username']
            shuffle(users)
            with open(destCls % cls, 'w') as f:
                for username in users:
                    print >> f, \
                          ("%s,http://vec.wikipedia.org/w/index.php?title=" + \
                          "Discussion_utente:%s&action=history&offset=" + \
                          "20100000000001") % (username, username)
Exemplo n.º 39
0
def main():
    import optparse

    p = optparse.OptionParser(usage="usage: %prog [-s SOURCE] [-h] file")
    p.add_option(
        "-s",
        "--source",
        metavar="SOURCE",
        dest="source",
        help="Specify a graph to use as source for attributes " + "(this will disable API calls)",
    )

    opts, files = p.parse_args()

    if not files:
        p.error("Give me a file, please ;-)")
    fn = files[0]

    lang, date, type_ = explode_dump_filename(fn)

    groups = (
        "bot",
        "sysop",
        "bureaucrat",
        "checkuser",
        "steward",
        "import",
        "transwiki",
        "uploader",
        "ipblock-exempt",
        "oversight",
        "founder",
        "rollbacker",
        "accountcreator",
        "autoreviewer",
        "abusefilter",
    )
    g = ig.load(fn)
    if opts.source:
        sourceg = ig.load(opts.source)
        for destv in g.vs:
            try:
                sourcev = sourceg.vs.select(username=destv["username"])[0]
            except IndexError:
                print destv["username"], "not found in source"
                for group in groups:
                    destv[group] = None
                continue
            for group in groups:
                destv[group] = sourcev[group]

    else:
        for group in groups:
            addGroupAttribute(g, lang, group)

        print "BLOCKED ACCOUNTS"
        addBlockedAttribute(g, lang)

    print "ANONYMOUS USERS"
    g.vs["anonymous"] = map(isip, g.vs["username"])
    g.write("%swiki-%s%s_rich.pickle" % (lang, date, type_), format="pickle")
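# A plausible invocation of the script above (the script name and file names
# are placeholders, not taken from the source):
#   python add_groups.py somewiki-20100218-current.pickle
#   python add_groups.py -s somewiki-20100101-current_rich.pickle \
#       somewiki-20100218-current.pickle  # reuse attributes, skip API calls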