Example #1
    def __init__(self, rts, tasks, result):
        super(EditorConsumer, self).__init__(rts, tasks, result)
        self.db_raw = storage.init_database(rts.storage, rts.dbname,
                                            rts.editors_raw)
        self.db_dataset = storage.init_database(rts.storage, rts.dbname,
                                                rts.editors_dataset)
        self.rts = rts
Example #2
def setup_database(rts):
    '''
    Initialize the database, including setting indexes and dropping the older
    version of the collection.
    '''
    db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
    db_dataset = storage.init_database(rts.storage, rts.dbname,
                                       rts.editors_dataset)
    db_dataset.drop_collection()
    editors = db_raw.retrieve_distinct_keys('user_id', force_new=True)
    return editors, db_raw, db_dataset
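
Every example on this page goes through the same storage.init_database(storage_type, dbname, collection) helper, whose implementation is not shown here. Purely for orientation, a minimal pymongo-backed wrapper consistent with these calls might look like the sketch below; it is an assumption, not the real storage module, and it only covers the handful of methods the simplest examples use.

# Hypothetical sketch only: the real storage module is not on this page.
# Assumes an old (Connection-era) pymongo, matching the Python 2 code above.
import pymongo

class MongoWrapper(object):
    def __init__(self, dbname, collection):
        self.conn = pymongo.Connection()      # localhost:27017 assumed
        self.db = self.conn[dbname]
        self.collection = self.db[collection]

    def insert(self, data, safe=False):
        self.collection.insert(data, safe=safe)

    def add_index(self, key):
        self.collection.create_index(key)

    def drop_collection(self):
        self.db.drop_collection(self.collection.name)

def init_database(storage_type, dbname, collection):
    # Only the 'mongo' backend is exercised by the examples on this page.
    if storage_type == 'mongo':
        return MongoWrapper(dbname, collection)
    raise ValueError('Unsupported storage backend: %s' % storage_type)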
Example #3
def launcher():
    '''
    This is the main entry point: it builds a queue of jobs, determines each
    job's type and fires the job off.
    '''
    db = storage.init_database(rts.storage, 'wikilytics', 'jobs')
    tasks = []
    project, language, parser = manager.init_args_parser()
    args = parser.parse_args(['django'])
    jobs = db.find({'finished': False, 'in_progress': False, 'error': False})
    for job in jobs:
        tasks.append(job)

    for task in tasks:
        if task['jobtype'] == 'dataset':
            print 'Launching the Editor Trends Analytics Toolkit.'
            res = launch_editor_trends_toolkit(task, args)
        else:
            print 'Launching %s.' % task['jobtype']
            res = launch_chart(task, args)

        if res:
            db.update('_id', task['_id'], {'$set': {'finished': True}})
        else:
            # To prevent jobs from recurring non-stop, set error to True.
            # These jobs will be excluded and need to be investigated to see
            # what's happening.
            db.update('_id', task['_id'], {'$set': {'error': True}})
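
launcher() polls the jobs collection for documents whose finished, in_progress and error flags are all False. A job that the query above would pick up could be enqueued like this (field names are inferred from the find() filter; the created field is an assumed extra):

import datetime

def enqueue_job(db, jobtype):
    # Matches the filter used by launcher(); 'created' is an assumption.
    db.insert({
        'jobtype': jobtype,          # 'dataset' or a chart name
        'finished': False,
        'in_progress': False,
        'error': False,
        'created': datetime.datetime.now(),
    })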
Example #4
def store_articles(tasks, rts):
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    filename = None
    while True:
        try:
            filename = tasks.get(block=False)
            tasks.task_done()
            if filename is None:
                break
            print 'Processing %s...' % filename
            fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r',
                                                  'utf-8')
            for line in fh:
                line = line.strip()
                line = line.split('\t')
                data = {}
                # tokens alternate key, value, key, value, ...
                x, y = 0, 1
                while y < len(line):
                    key, value = line[x], line[y]
                    if key == 'ns' or key == 'id':
                        data[key] = int(value)
                    else:
                        data[key] = value
                    x += 2
                    y += 2
                db.insert(data)
            fh.close()
        except Empty:
            pass
    print 'Finished processing all files.'
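
The inner while loop reads each tab-separated line as alternating key/value tokens. The same parse, written compactly with made-up input, for illustration:

# Equivalent one-liner for the alternating key/value parse above.
line = 'id\t12\tns\t0\ttitle\tPython'          # illustrative line
tokens = line.split('\t')
data = dict(zip(tokens[0::2], tokens[1::2]))
# -> {'id': '12', 'ns': '0', 'title': 'Python'}; 'id' and 'ns' are then
# cast to int, as in store_articles.
print data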
Example #5
def store_json_diffs(rts):
    files = os.listdir(rts.diffs)
    #print files, rts.diffs
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    buffer = cStringIO.StringIO()

    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
                                              'utf-8')
        for line in fh:
            if line.startswith('\n') or line.startswith('Start'):
                obj = buffer.getvalue()
                if obj != '':
                    obj = json.loads(obj)
                    obj[0]['article_id'] = int(obj[0]['article_id'])
                    for key, value in obj[0].iteritems():
                        if key == 'timestamp':
                            value = datetime.strptime(value,
                                                      '%Y-%m-%dT%H:%M:%S')
                        obj[0][key] = value
                    obj = obj[0]
                    try:
                        db.save(obj)
                    except bson.errors.InvalidDocument, error:
                        print error
                buffer = cStringIO.StringIO()
            else:
                buffer.write(line)
        fh.close()
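
store_json_diffs buffers lines until it hits an empty line or a line starting with 'Start', then parses the buffer as a JSON array whose first element is the record. What happens to one such record, with a made-up sample:

import json
from datetime import datetime

record = '[{"article_id": "12", "timestamp": "2011-01-01T00:00:00"}]'
obj = json.loads(record)[0]
obj['article_id'] = int(obj['article_id'])
obj['timestamp'] = datetime.strptime(obj['timestamp'], '%Y-%m-%dT%H:%M:%S')
print obj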
Example #6
    def run(self):
        db = storage.init_database(self.rts.storage, self.rts.dbname,
                                   self.rts.editors_raw)
        editor_cache = cache.EditorCache(db)
        while True:
            try:
                filename = self.tasks.get(block=False)
                self.tasks.task_done()
                if filename is None:
                    self.result.put(None)
                    break

                fh = file_utils.create_txt_filehandle(self.rts.sorted,
                                                      filename, 'r', 'utf-8')
                data = []
                for line in file_utils.read_raw_data(fh):
                    if len(line) == 1:  # or len(line) == 4:
                        continue
                    obs = prepare_data(line)
                    if obs:
                        data.append(obs)
                    if len(data) == 10000:
                        db.insert(data, safe=False)
                        data = []

                if data:
                    db.insert(data, safe=False)
                fh.close()
                self.result.put(True)
            except Empty:
                pass
Example #7
def preload(rts):
    collection = '%s%s_articles_raw' % (rts.language.code, rts.project.name)
    db = storage.init_database(rts.storage, rts.dbname, collection)
    data = {}
    cursor = db.find('category', 'List')
    for c in cursor:
        data[c['id']] = 1
    return data, rts
Example #8
    def to_mongo(self):
        dbname = '%s%s' % (self.language_code, self.project)
        db = storage.init_database(self.rts.storage, dbname, 'charts')
        db.add_son_manipulator(Transform())
        db.remove({
            'hash': self.hash,
            'project': self.project,
            'language_code': self.language_code
        })
        db.insert({'variables': self})
Example #9
def store_diffs_debug(rts):
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    db.drop_collection()
    files = os.listdir(rts.diffs)
    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
                                              'utf-8')
        diffs = json.load(fh)
        db.insert(diffs)
        fh.close()
Example #10
def retrieve_bots(db_type, language_code):
    '''
    Loader function to retrieve list of id's of known Wikipedia bots.
    '''
    bots = []
    db = storage.init_database(db_type, 'bots', 'ids')
    cursor = db.find()
    for bot in cursor:
        if bot['verified'] == 'True' and language_code in bot['projects']:
            bots.append(bot['name'])
    return bots
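
Note that verified is compared against the string 'True' rather than a boolean, so the documents evidently store it as text. A hypothetical document that would pass the filter for language_code 'en':

# Hypothetical document shape inferred from the filter in retrieve_bots.
bot = {
    'name': 'ExampleBot',        # made-up name
    'verified': 'True',          # stored as a string, not a boolean
    'projects': ['en', 'de'],    # language codes the bot is active on
}
print bot['verified'] == 'True' and 'en' in bot['projects']   # True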
Example #11
def add_indexes(rts):
    db_dataset = storage.init_database(rts.storage, rts.dbname,
                                       rts.editors_dataset)
    print '\nCreating indexes...'
    db_dataset.add_index('user_id')
    db_dataset.add_index('new_wikipedian')
    db_dataset.add_index('username')
    db_dataset.add_index('cum_edit_count_main_ns')
    db_dataset.add_index('cum_edit_count_other_ns')
    db_dataset.add_index('first_edit')
    db_dataset.add_index('final_edit')

    print 'Finished creating indexes...'
Example #12
def launcher(rts):
    '''
    This is the main entry point and creates a number of workers and launches
    them. 
    '''
    print 'Input directory is: %s ' % rts.sorted
    db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
    db.drop_collection()

    files = file_utils.retrieve_file_list(rts.sorted, 'csv')
    pbar = progressbar.ProgressBar(maxval=len(files)).start()

    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()

    storers = [
        Storer(rts, tasks, result) for x in xrange(rts.number_of_processes)
    ]

    for filename in files:
        tasks.put(filename)

    for x in xrange(rts.number_of_processes):
        tasks.put(None)

    for storer in storers:
        storer.start()

    ppills = rts.number_of_processes
    while ppills > 0:
        try:
            res = result.get(block=False)
            if res:
                pbar.update(pbar.currval + 1)
            else:
                ppills -= 1
        except Empty:
            pass

    tasks.join()
    print '\nCreating indexes...'
    db.add_index('user_id')
    db.add_index('username')
    db.add_index('article_id')
    db.add_index('reverted_by')
    db.add_index('revert')
    db.add_index('bot')
    db.add_index('date')
    db.add_index('ns')
    db.add_index('delta')
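
The None values pushed onto the task queue are poison pills: one per worker, so every Storer eventually consumes one, exits, and answers with None on the result queue, which drives the ppills countdown. The same pattern in isolation, as a self-contained sketch independent of the storage code:

import multiprocessing

def worker(tasks, result):
    while True:
        item = tasks.get()
        tasks.task_done()
        if item is None:
            result.put(None)          # tell the parent this worker is done
            break
        result.put(True)              # one unit of work finished

if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()
    n = 2
    for item in ['a.csv', 'b.csv', 'c.csv']:
        tasks.put(item)
    for _ in xrange(n):
        tasks.put(None)               # one poison pill per worker
    workers = [multiprocessing.Process(target=worker, args=(tasks, result))
               for _ in xrange(n)]
    for w in workers:
        w.start()
    pills = n
    while pills > 0:                  # same countdown as launcher() above
        if result.get() is None:
            pills -= 1
    tasks.join()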
Example #13
def debug():
    db = storage.init_database('mongo', 'wikilytics', 'enwiki_charts')
    #db.add_son_manipulator(Transform())

    lock = RLock()
    v = Variable('test', 'year', lock, {})

    for x in xrange(100000):
        year = random.randrange(2005, 2010)
        month = random.randrange(1, 13)  # randrange excludes the stop value
        day = random.randrange(1, 29)    # stay within the shortest month
        d = datetime.datetime(year, month, day)
        x = random.randrange(1, 10000)
        v.add(d, x, {'username': '******'})
    gc.collect()
Example #14
def to_db(rts, jobtype, task, timer, event='start'):
    db = storage.init_database(rts.storage, rts.dbname, 'jobs')
    created = datetime.datetime.now()
    job = db.find_one({'hash': rts.id})
    #print job
    data = {
        'hash': rts.id,
        'created': created,
        'jobtype': jobtype,
        'in_progress': True,
        'language_code': rts.language.code,
        'project': rts.project.name,
        'tasks': {},
    }

    if job is None:
        if jobtype == 'dataset':
            data['finished'] = False
            _id = db.save(data)
        elif jobtype == 'chart':
            data['finished'] = True
            _id = db.save(data)

        job = db.find_one({'_id': _id})

    tasks = job['tasks']
    t = tasks.get(task, {})
    if event == 'start':
        t['start'] = timer.t0
        t['in_progress'] = True
        tasks[task] = t
        db.update('hash', rts.id, {'$set': {'tasks': tasks}})
        #coll.update({'hash': hash}, {'$set': {'tasks': tasks}})
    elif event == 'finish':
        t['finish'] = timer.t1
        t['in_progress'] = False
        tasks[task] = t
        if task == 'transform' or jobtype == 'chart':
            #final task, set entire task to finished
            db.update('hash', rts.id, {
                '$set': {
                    'tasks': tasks,
                    'in_progress': False,
                    'finished': True
                }
            })
        else:
            db.update('hash', rts.id, {'$set': {'tasks': tasks}})
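
The timer argument is only read for its t0 and t1 attributes, so any object exposing a start and a finish time will do. A minimal stand-in under that assumption (the real timer module is not shown on this page):

import datetime

class Timer(object):
    # Minimal stand-in: only t0 and t1 are read by to_db() above.
    def __init__(self):
        self.t0 = datetime.datetime.now()   # start time, read on 'start'
        self.t1 = None                      # finish time, read on 'finish'

    def stop(self):
        self.t1 = datetime.datetime.now()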
Example #15
def store_bots():
    '''
    This function reads the results from the lookup_bot_userid function and
    stores them in a MongoDB collection.
    '''
    keys = ['name', 'verified', 'projects']
    bots = file_utils.create_dict_from_csv_file(settings.csv_location,
                                                'bots_ids.csv', 'utf-8', keys)
    db = storage.init_database(rts.storage, 'wikilytics', 'bots')
    db.drop_collection()
    for id in bots:
        bot = bots[id]
        data = dict([(k, bot[k]) for k in keys])
        data['id'] = id
        db.insert(data)

    print 'Stored %s bots' % db.count()
Example #16
def sor_newbie_treatment(editor, var, **kwargs):
    rts = kwargs.pop('rts')
    tenth_edit = editor['new_wikipedian']
    title = ':%s' % editor['username']
    collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
    db = storage.init_database(rts.storage, rts.dbname, collection)

    if tenth_edit:
        qualifier = {'ns': 3, 'timestamp': {'$lt': tenth_edit}}
        observations = db.find(qualifier)
    else:
        observations = db.find('editor', editor)

    if observations is not None:
        for obs in observations:
            if obs['ns'] == 3:
                values = obs.values()
                print values
Example #17
def generate_chart_data(rts, func, **kwargs):
    '''
    This is the entry function to call to generate the data for creating
    charts.
    '''

    stopwatch = timer.Timer()
    plugin = retrieve_plugin(func)

    if not plugin:
        available_plugins = inventory.available_analyses()
        raise exceptions.UnknownPluginError(plugin, available_plugins)
    plugin = getattr(plugin, func)

    feedback(func, rts)

    tasks = JoinableQueue()
    result = JoinableQueue()

    mgr = Manager()
    lock = mgr.RLock()
    obs = dict()
    obs_proxy = mgr.dict(obs)

    db = storage.init_database(rts.storage, rts.dbname, rts.collection)
    editors = db.retrieve_distinct_keys('editor')
    #editors = editors[:500]
    if rts.collection.find('editors_dataset') > -1:
        min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
        kwargs['min_year'] = min_year
        kwargs['max_year'] = max_year

    fmt = kwargs.pop('format', 'long')
    time_unit = kwargs.pop('time_unit', 'year')

    var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)

    try:
        print 'Determining whether plugin requires preloaded data...'
        preloader = getattr(plugin, 'preload')
        print 'Preloading data...'
        data = preloader(rts)
    except Exception, error:
        data = None
Example #18
    def run(self):
        '''
        Generic loop function that loops over all the editors of a Wikipedia
        project and then calls the plugin that does the actual mapping.
        '''
        db = storage.init_database(self.rts.storage, self.rts.dbname,
                                   self.rts.collection)
        while True:
            try:
                editor_id = self.tasks.get(block=False)
                self.tasks.task_done()
                if editor_id is None:
                    self.result.put(self.var)
                    break
                editor = db.find_one({'editor': editor_id})
                self.plugin(self.var, editor, rts=self.rts, data=self.data)
                self.result.put(True)
            except Empty:
                pass
Example #19
def launcher_articles(rts):
    '''
    This function reads articles.csv and stores it in a separate collection.
    Besides containing the title of an article, it also includes:
    * namespace
    * category (if any)
    * article id
    * redirect (true / false)
    * timestamp article created
    '''
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    db.drop_collection()

    files = file_utils.retrieve_file_list(rts.txt,
                                          extension='csv',
                                          mask='articles')
    tasks = multiprocessing.JoinableQueue()

    print 'Storing articles...'

    for filename in files:
        tasks.put(filename)

    for x in xrange(rts.number_of_processes):
        tasks.put(None)

    storers = [
        multiprocessing.Process(target=store_articles, args=[tasks, rts])
        for x in xrange(rts.number_of_processes)
    ]

    for storer in storers:
        storer.start()

    tasks.join()

    print '\nCreating indexes...'
    db.add_index('id')
    db.add_index('title')
    db.add_index('ns')
    db.add_index('category')
Example #20
def launcher(rts):
    '''
    This function initializes the multiprocessing workers and loads the
    queue with the compressed XML files.
    '''
    input_queue = JoinableQueue()
    format = 'json'
    files = file_utils.retrieve_file_list(rts.input_location)

    if len(files) > cpu_count():
        processors = cpu_count() - 1
    else:
        processors = len(files)

    for filename in files:
        filename = os.path.join(rts.input_location, filename)
        print filename
        input_queue.put(filename)

    for x in xrange(processors):
        print 'Inserting poison pill %s...' % x
        input_queue.put(None)


#    extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
#                                                       rts, format])
#                  for process_id in xrange(processors)]
#    for extracter in extracters:
#        extracter.start()
#
#    input_queue.join()

    store_json_diffs(rts)
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)

    db.add_index('title')
    db.add_index('timestamp')
    db.add_index('username')
    db.add_index('ns')
    db.add_index('editor')
Example #21
def create_edgelist(project, collection):
    db = storage.init_database('mongo', project, collection)
    ids = db.retrieve_distinct_keys('editor')
    ids.sort()
    fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                          '%s_edgelist.csv' % project, 'w',
                                          'utf-8')
    for i in ids:
        author_i = db.find_one({'editor': i})
        if author_i is not None:
            article_i = create_articles_set(author_i['edits'])
            for j in ids:
                if i > j:
                    author_j = db.find_one({'editor': j})
                    article_j = create_articles_set(author_j['edits'])
                    common = article_i.intersection(article_j)
                    if len(common) > 0:
                        file_utils.write_list_to_csv([i, j, len(common)],
                                                     fh,
                                                     recursive=False,
                                                     newline=True)
    fh.close()
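
create_articles_set is not defined on this page; judging from the call site it reduces an editor's edits to a set of article ids so that the intersection counts co-edited articles. A sketch consistent with that use, assuming edits maps a period to a list of edit records carrying an article id:

def create_articles_set(edits):
    # Assumed shape: edits maps a period (e.g. a year) to a list of edit
    # dicts, each with an 'article' id. Purely a sketch of the call site.
    articles = set()
    for period in edits.itervalues():
        for edit in period:
            articles.add(edit['article'])
    return articles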
Example #22
    def store_available_dumps(self):
        db = storage.init_database(rts.storage, 'wikilytics',
                                   'available_dumps')
        db.save({'project': self.project, 'dumps': self.data})
Example #23
            fh.write('\n')


def write_revision(dataset, revision):
    for i, key in enumerate(keys):
        if isinstance(revision[key], int):
            revision[key] = str(revision[key])
        dataset.write('%s' % revision[key].decode('utf-8'))
        if (i + 1) != len(keys):
            dataset.write('\t')
        else:
            dataset.write('\n')


print 'Constructing training dataset...'
db_dataset = storage.init_database('mongo', 'wikilytics',
                                   'enwiki_editors_dataset')
print 'Loading editors...'
if not os.path.exists('set_a.bin'):
    pre_editors, post_editors = determine_editors(db_dataset)
    fh = open('set_a.bin', 'wb')
    cPickle.dump(pre_editors, fh)
    fh.close()

    fh = open('set_b.bin', 'wb')
    cPickle.dump(post_editors, fh)
    fh.close()
else:
    pre_editors = load_binary_file('set_a.bin')
    post_editors = load_binary_file('set_b.bin')

dataset = codecs.open('training.tsv', 'w', 'utf-8')
Example #24
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-04-20'
__version__ = '0.1'

import sys
import os
from datetime import datetime
if '..' not in sys.path:
    sys.path.append('..%s..%s' % (os.sep, os.sep))

from classes import storage
from classes import settings

rts = settings.Settings()
db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
location = os.path.join(rts.csv_location, 'd_20110502.tsv')

fh = open(location, 'r')
for i, line in enumerate(fh):
    if i == 0:
        continue
    line = line.strip()
    line = line.replace("'", '')
    line = line.split('\t')
    id = line[0]
    id = id[:-1]
    if line[1] == 'None':
        continue
    date = datetime.strptime(line[1][:8], '%Y%m%d')
    if i % 1000 == 0:
Example #25
    def __init__(self, rts, tasks, result):
        self.db_raw = storage.init_database(rts.storage, rts.dbname,
                                            rts.editors_raw)
        self.db_dataset = storage.init_database(rts.storage, rts.dbname,
                                                rts.editors_dataset)