def __init__(self, rts, tasks, result):
    super(EditorConsumer, self).__init__(rts, tasks, result)
    self.db_raw = storage.init_database(rts.storage, rts.dbname,
                                        rts.editors_raw)
    self.db_dataset = storage.init_database(rts.storage, rts.dbname,
                                            rts.editors_dataset)
    self.rts = rts

def setup_database(rts):
    '''
    Initialize the database, including setting indexes and dropping the older
    version of the collection.
    '''
    db_raw = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
    db_dataset = storage.init_database(rts.storage, rts.dbname,
                                       rts.editors_dataset)
    db_dataset.drop_collection()
    editors = db_raw.retrieve_distinct_keys('user_id', force_new=True)
    return editors, db_raw, db_dataset

def launcher():
    '''
    This is the main entry point: it creates a queue of jobs, determines the
    type of each job and fires it off.
    '''
    db = storage.init_database(rts.storage, 'wikilytics', 'jobs')
    tasks = []
    project, language, parser = manager.init_args_parser()
    args = parser.parse_args(['django'])
    jobs = db.find({'finished': False, 'in_progress': False, 'error': False})
    for job in jobs:
        tasks.append(job)
    for task in tasks:
        if task['jobtype'] == 'dataset':
            print 'Launching the Editor Trends Analytics Toolkit.'
            res = launch_editor_trends_toolkit(task, args)
        else:
            print 'Launching %s.' % task['jobtype']
            res = launch_chart(task, args)
        if res:
            db.update({'_id': task['_id']}, {'$set': {'finished': True}})
        else:
            # To prevent jobs from recurring non-stop, set error to True.
            # These jobs will be excluded and need to be investigated to see
            # what's happening.
            db.update('_id', task['_id'], {'$set': {'error': True}})

def store_articles(tasks, rts):
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    filename = None
    while True:
        try:
            filename = tasks.get(block=False)
            tasks.task_done()
            if filename == None:
                # Poison pill received, stop consuming tasks.
                break
            print 'Processing %s...' % filename
            fh = file_utils.create_txt_filehandle(rts.txt, filename, 'r',
                                                  'utf-8')
            for line in fh:
                line = line.strip()
                line = line.split('\t')
                data = {}
                # Fields alternate between keys and values; walk them in
                # pairs and cast the numeric fields to int.
                x, y = 0, 1
                while y < len(line):
                    key, value = line[x], line[y]
                    if key == 'ns' or key == 'id':
                        data[key] = int(value)
                    else:
                        data[key] = value
                    x += 2
                    y += 2
                db.insert(data)
            fh.close()
        except Empty:
            pass
    print 'Finished processing %s...' % filename

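# Illustrative, stand-alone sketch (not part of the toolkit) of the key/value
# parsing loop used in store_articles above: each tab-separated line
# alternates between keys and values, and the 'ns' and 'id' fields are cast
# to int. The function name and the sample line are made up.

def parse_article_line(line):
    fields = line.strip().split('\t')
    data = {}
    for key, value in zip(fields[0::2], fields[1::2]):
        if key in ('ns', 'id'):
            data[key] = int(value)
        else:
            data[key] = value
    return data


print parse_article_line('id\t12\tns\t0\ttitle\tAnarchism')
# prints a dict with id=12, ns=0 and title='Anarchism'
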
def store_json_diffs(rts):
    files = os.listdir(rts.diffs)
    #print files, rts.diffs
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    buffer = cStringIO.StringIO()
    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
                                              'utf-8')
        for line in fh:
            if line.startswith('\n') or line.startswith('Start'):
                obj = buffer.getvalue()
                if obj != '':
                    obj = json.loads(obj)
                    obj[0]['article_id'] = int(obj[0]['article_id'])
                    for key, value in obj[0].iteritems():
                        if key == 'timestamp':
                            value = datetime.strptime(value,
                                                      '%Y-%m-%dT%H:%M:%S')
                            obj[0][key] = value
                    obj = obj[0]
                    try:
                        db.save(obj)
                    except bson.errors.InvalidDocument, error:
                        print error
                buffer = cStringIO.StringIO()
            else:
                buffer.write(line)
        fh.close()

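# Stand-alone sketch of the buffering pattern used in store_json_diffs above:
# lines are accumulated in a string buffer until a delimiter line is reached,
# at which point the buffer is parsed as JSON and reset. The helper name and
# the sample input are made up for illustration.
import json
import cStringIO


def iter_json_objects(lines, delimiter='Start'):
    buffer = cStringIO.StringIO()
    for line in lines:
        if line.startswith('\n') or line.startswith(delimiter):
            chunk = buffer.getvalue()
            if chunk != '':
                yield json.loads(chunk)
            buffer = cStringIO.StringIO()
        else:
            buffer.write(line)
    # Flush whatever is left after the final delimiter.
    chunk = buffer.getvalue()
    if chunk != '':
        yield json.loads(chunk)


sample = ['Start\n', '[{"article_id": "12"}]\n',
          'Start\n', '[{"article_id": "13"}]\n']
for obj in iter_json_objects(sample):
    print obj[0]['article_id']
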
def run(self):
    db = storage.init_database(self.rts.storage, self.rts.dbname,
                               self.rts.editors_raw)
    editor_cache = cache.EditorCache(db)
    while True:
        try:
            filename = self.tasks.get(block=False)
            self.tasks.task_done()
            if filename == None:
                self.result.put(None)
                break
            fh = file_utils.create_txt_filehandle(self.rts.sorted, filename,
                                                  'r', 'utf-8')
            data = []
            for line in file_utils.read_raw_data(fh):
                if len(line) == 1:  # or len(line) == 4:
                    continue
                obs = prepare_data(line)
                if obs != {}:
                    data.append(obs)
                if len(data) == 10000:
                    # Flush observations to the database in batches of 10,000.
                    db.insert(data, safe=False)
                    data = []
            if data != []:
                db.insert(data, safe=False)
            fh.close()
            self.result.put(True)
        except Empty:
            pass

def preload(rts):
    collection = '%s%s_articles_raw' % (rts.language.code, rts.project.name)
    db = storage.init_database(rts.storage, rts.dbname, collection)
    data = {}
    cursor = db.find('category', 'List')
    for c in cursor:
        data[c['id']] = 1
    return data, rts

def to_mongo(self):
    dbname = '%s%s' % (self.language_code, self.project)
    db = storage.init_database(self.rts.storage, dbname, 'charts')
    db.add_son_manipulator(Transform())
    db.remove({'hash': self.hash,
               'project': self.project,
               'language_code': self.language_code})
    db.insert({'variables': self})

def store_diffs_debug(rts):
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    db.drop_collection()
    files = os.listdir(rts.diffs)
    for filename in files:
        fh = file_utils.create_txt_filehandle(rts.diffs, filename, 'r',
                                              'utf-8')
        diffs = json.load(fh)
        db.insert(diffs)
        fh.close()

def retrieve_bots(db_type, language_code):
    '''
    Loader function to retrieve a list of ids of known Wikipedia bots.
    '''
    bots = []
    db = storage.init_database(db_type, 'bots', 'ids')
    cursor = db.find()
    for bot in cursor:
        if bot['verified'] == 'True' and language_code in bot['projects']:
            bots.append(bot['name'])
    return bots

def add_indexes(rts):
    db_dataset = storage.init_database(rts.storage, rts.dbname,
                                       rts.editors_dataset)
    print '\nCreating indexes...'
    db_dataset.add_index('user_id')
    db_dataset.add_index('new_wikipedian')
    db_dataset.add_index('username')
    db_dataset.add_index('cum_edit_count_main_ns')
    db_dataset.add_index('cum_edit_count_other_ns')
    db_dataset.add_index('first_edit')
    db_dataset.add_index('final_edit')
    print 'Finished creating indexes...'

def launcher(rts):
    '''
    This is the main entry point; it creates a number of workers and
    launches them.
    '''
    print 'Input directory is: %s ' % rts.sorted
    db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
    db.drop_collection()
    files = file_utils.retrieve_file_list(rts.sorted, 'csv')
    pbar = progressbar.ProgressBar(maxval=len(files)).start()
    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()
    storers = [Storer(rts, tasks, result)
               for x in xrange(rts.number_of_processes)]
    for filename in files:
        tasks.put(filename)
    for x in xrange(rts.number_of_processes):
        # One poison pill per worker signals that the queue is exhausted.
        tasks.put(None)
    for storer in storers:
        storer.start()
    ppills = rts.number_of_processes
    while ppills > 0:
        try:
            res = result.get(block=False)
            if res == True:
                pbar.update(pbar.currval + 1)
            else:
                ppills -= 1
        except Empty:
            pass
    tasks.join()
    print '\nCreating indexes...'
    db.add_index('user_id')
    db.add_index('username')
    db.add_index('article_id')
    db.add_index('reverted_by')
    db.add_index('revert')
    db.add_index('bot')
    db.add_index('date')
    db.add_index('ns')
    db.add_index('delta')

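# Minimal, self-contained sketch of the poison-pill pattern that the launcher
# above relies on: one None per worker is pushed onto the queue, each worker
# exits when it pulls a None and echoes it back, and the parent counts pills
# coming back on the result queue. The worker function and the numeric items
# are illustrative, not the toolkit's own.
import multiprocessing
from Queue import Empty


def worker(tasks, result):
    while True:
        try:
            item = tasks.get(block=False)
            tasks.task_done()
            if item is None:
                result.put(None)   # return the poison pill to the parent
                break
            result.put(item * item)
        except Empty:
            pass


if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()
    processes = 2
    for item in xrange(10):
        tasks.put(item)
    for x in xrange(processes):
        tasks.put(None)
    workers = [multiprocessing.Process(target=worker, args=[tasks, result])
               for x in xrange(processes)]
    for w in workers:
        w.start()
    pills = processes
    while pills > 0:
        res = result.get()
        if res is None:
            pills -= 1
        else:
            print res
    tasks.join()
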
def debug():
    db = storage.init_database('mongo', 'wikilytics', 'enwiki_charts')
    #db.add_son_manipulator(Transform())
    lock = RLock()
    v = Variable('test', 'year', lock, {})
    for x in xrange(100000):
        year = random.randrange(2005, 2010)
        month = random.randrange(1, 12)
        day = random.randrange(1, 28)
        d = datetime.datetime(year, month, day)
        x = random.randrange(1, 10000)
        v.add(d, x, {'username': '******'})
    gc.collect()

def to_db(rts, jobtype, task, timer, event='start'):
    db = storage.init_database(rts.storage, rts.dbname, 'jobs')
    created = datetime.datetime.now()
    job = db.find_one({'hash': rts.id})
    #print job
    data = {'hash': rts.id,
            'created': created,
            'jobtype': jobtype,
            'in_progress': True,
            'language_code': rts.language.code,
            'project': rts.project.name,
            'tasks': {}}
    if job == None:
        if jobtype == 'dataset':
            data['finished'] = False
            _id = db.save(data)
        elif jobtype == 'chart':
            data['finished'] = True
            _id = db.save(data)
        job = db.find_one({'_id': _id})
    tasks = job['tasks']
    t = tasks.get(task, {})
    if event == 'start':
        t['start'] = timer.t0
        t['in_progress'] = True
        tasks[task] = t
        db.update('hash', rts.id, {'$set': {'tasks': tasks}})
        #coll.update({'hash': hash}, {'$set': {'tasks': tasks}})
    elif event == 'finish':
        t['finish'] = timer.t1
        t['in_progress'] = False
        tasks[task] = t
        if task == 'transform' or jobtype == 'chart':
            # Final task: mark the entire job as finished.
            db.update('hash', rts.id, {'$set': {'tasks': tasks,
                                                'in_progress': False,
                                                'finished': True}})
        else:
            db.update('hash', rts.id, {'$set': {'tasks': tasks}})

def store_bots():
    '''
    This function reads the results of the lookup_bot_userid function and
    stores them in a MongoDB collection.
    '''
    keys = ['name', 'verified', 'projects']
    bots = file_utils.create_dict_from_csv_file(settings.csv_location,
                                                'bots_ids.csv', 'utf-8', keys)
    db = storage.init_database(rts.storage, 'wikilytics', 'bots')
    db.drop_collection()
    for id in bots:
        bot = bots[id]
        data = dict([(k, bot[k]) for k in keys])
        data['id'] = id
        db.insert(data)
    print 'Stored %s bots' % db.count()

def sor_newbie_treatment(editor, var, **kwargs):
    rts = kwargs.pop('rts')
    tenth_edit = editor['new_wikipedian']
    title = ':%s' % editor['username']
    collection = '%s%s_diffs_dataset' % (rts.language.code, rts.project.name)
    db = storage.init_database(rts.storage, rts.dbname, collection)
    if tenth_edit != False:
        qualifier = {'ns': 3, 'timestamp': {'$lt': tenth_edit}}
        observations = db.find_one(qualifier)
    else:
        observations = db.find_one('editor', editor)
    if observations != None:
        for obs in observations:
            if obs['ns'] == 3:
                values = obs.values()
                print values

def generate_chart_data(rts, func, **kwargs):
    '''
    This is the entry function to be called to generate data for creating
    charts.
    '''
    stopwatch = timer.Timer()
    plugin = retrieve_plugin(func)
    if not plugin:
        available_plugins = inventory.available_analyses()
        raise exceptions.UnknownPluginError(plugin, available_plugins)
    plugin = getattr(plugin, func)
    feedback(func, rts)
    tasks = JoinableQueue()
    result = JoinableQueue()
    mgr = Manager()
    lock = mgr.RLock()
    obs = dict()
    obs_proxy = mgr.dict(obs)
    db = storage.init_database(rts.storage, rts.dbname, rts.collection)
    editors = db.retrieve_distinct_keys('editor')
    #editors = editors[:500]
    if rts.collection.find('editors_dataset') > -1:
        min_year, max_year = determine_project_year_range(db,
                                                          'new_wikipedian')
        kwargs['min_year'] = min_year
        kwargs['max_year'] = max_year
    fmt = kwargs.pop('format', 'long')
    time_unit = kwargs.pop('time_unit', 'year')
    var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)
    try:
        print 'Determining whether plugin requires preloaded data...'
        preloader = getattr(plugin, 'preload')
        print 'Preloading data...'
        data = preloader(rts)
    except Exception, error:
        data = None

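# Hedged, stand-alone sketch of the plugin-dispatch idea in generate_chart_data
# above: the analysis function is looked up by name with getattr, and an
# optional preload hook is detected the same way. The plugins container and
# the edit_count / preload functions below are made up for illustration.

class plugins(object):
    '''Stand-in for a module exposing analysis functions.'''

    @staticmethod
    def edit_count(var, editor, **kwargs):
        print 'Analysing %s with preloaded data %s' % (editor, kwargs['data'])

    @staticmethod
    def preload(rts):
        return {'cached': True}


func = 'edit_count'
plugin = getattr(plugins, func, None)
if plugin is None:
    raise ValueError('Unknown plugin: %s' % func)

try:
    # Plugins that need extra data expose a preload hook; fall back to None.
    data = getattr(plugins, 'preload')(None)
except AttributeError:
    data = None

plugin(None, 'SomeEditor', data=data)
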
def run(self):
    '''
    Generic loop function that loops over all the editors of a Wikipedia
    project and then calls the plugin that does the actual mapping.
    '''
    db = storage.init_database(self.rts.storage, self.rts.dbname,
                               self.rts.collection)
    while True:
        try:
            editor_id = self.tasks.get(block=False)
            self.tasks.task_done()
            if editor_id == None:
                self.result.put(self.var)
                break
            editor = db.find_one({'editor': editor_id})
            self.plugin(self.var, editor, rts=self.rts, data=self.data)
            self.result.put(True)
        except Empty:
            pass

def launcher_articles(rts):
    '''
    This function reads articles.csv and stores it in a separate collection.
    Besides the title of an article, it also includes:
    * namespace
    * category (if any)
    * article id
    * redirect (true / false)
    * timestamp the article was created
    '''
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    db.drop_collection()
    files = file_utils.retrieve_file_list(rts.txt, extension='csv',
                                          mask='articles')
    tasks = multiprocessing.JoinableQueue()
    print 'Storing articles...'
    for filename in files:
        tasks.put(filename)
    for x in xrange(rts.number_of_processes):
        tasks.put(None)
    storers = [multiprocessing.Process(target=store_articles,
                                       args=[tasks, rts])
               for x in xrange(rts.number_of_processes)]
    for storer in storers:
        storer.start()
    tasks.join()
    print '\nCreating indexes...'
    db.add_index('id')
    db.add_index('title')
    db.add_index('ns')
    db.add_index('category')

def launcher(rts):
    '''
    This function initializes the multiprocessing workers and loads the queue
    with the compressed XML files.
    '''
    input_queue = JoinableQueue()
    format = 'json'
    files = file_utils.retrieve_file_list(rts.input_location)
    if len(files) > cpu_count():
        processors = cpu_count() - 1
    else:
        processors = len(files)
    for filename in files:
        filename = os.path.join(rts.input_location, filename)
        print filename
        input_queue.put(filename)
    for x in xrange(processors):
        print 'Inserting poison pill %s...' % x
        input_queue.put(None)
#    extracters = [Process(target=stream_raw_xml,
#                          args=[input_queue, process_id, rts, format])
#                  for process_id in xrange(processors)]
#    for extracter in extracters:
#        extracter.start()
#
#    input_queue.join()
    store_json_diffs(rts)
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    db.add_index('title')
    db.add_index('timestamp')
    db.add_index('username')
    db.add_index('ns')
    db.add_index('editor')

def create_edgelist(project, collection):
    db = storage.init_database('mongo', project, collection)
    ids = db.retrieve_distinct_keys('editor')
    ids.sort()
    fh = file_utils.create_txt_filehandle(settings.dataset_location,
                                          '%s_edgelist.csv' % project,
                                          'w', 'utf-8')
    for i in ids:
        author_i = db.find_one({'editor': i})
        if author_i != None:
            article_i = create_articles_set(author_i['edits'])
            for j in ids:
                if i > j:
                    author_j = db.find_one({'editor': j})
                    article_j = create_articles_set(author_j['edits'])
                    common = article_i.intersection(article_j)
                    if len(common) > 0:
                        file_utils.write_list_to_csv([i, j, len(common)], fh,
                                                     recursive=False,
                                                     newline=True)
    fh.close()

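# Self-contained sketch of the co-editing edge list computed above: for every
# pair of editors, the edge weight is the number of articles both have edited.
# The toy data below is made up; the real code pulls each editor's edit set
# from MongoDB.

def build_edgelist(article_sets):
    edges = []
    ids = sorted(article_sets.keys())
    for i in ids:
        for j in ids:
            if i > j:
                common = article_sets[i].intersection(article_sets[j])
                if len(common) > 0:
                    edges.append((i, j, len(common)))
    return edges


article_sets = {'alice': set([1, 2, 3]),
                'bob': set([2, 3, 4]),
                'carol': set([5])}
print build_edgelist(article_sets)
# [('bob', 'alice', 2)] -- carol shares no articles with anyone
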
def store_available_dumps(self):
    db = storage.init_database(rts.storage, 'wikilytics', 'available_dumps')
    db.save({'project': self.project, 'dumps': self.data})

    fh.write('\n')


def write_revision(dataset, revision):
    for i, key in enumerate(keys):
        if type(revision[key]) == type(0):
            revision[key] = str(revision[key])
        dataset.write('%s' % revision[key].decode('utf-8'))
        if (i + 1) != len(keys):
            dataset.write('\t')
        else:
            dataset.write('\n')


print 'Constructing training dataset...'
db_dataset = storage.init_database('mongo', 'wikilytics',
                                   'enwiki_editors_dataset')
print 'Loading editors...'
if not os.path.exists('set_a.bin'):
    pre_editors, post_editors = determine_editors(db_dataset)
    fh = open('set_a.bin', 'wb')
    cPickle.dump(pre_editors, fh)
    fh.close()
    fh = open('set_b.bin', 'wb')
    cPickle.dump(post_editors, fh)
    fh.close()
else:
    pre_editors = load_binary_file('set_a.bin')
    post_editors = load_binary_file('set_b.bin')

dataset = codecs.open('training.tsv', 'w', 'utf-8')

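# Stand-alone sketch of the caching idiom used above: an expensive computation
# is pickled to disk on the first run and read back from the binary file on
# later runs. The compute_sets helper and the cache filename are made up.
import os
import cPickle


def compute_sets():
    # Placeholder for an expensive query such as determine_editors().
    return set(xrange(10)), set(xrange(10, 20))


if not os.path.exists('cache.bin'):
    set_a, set_b = compute_sets()
    fh = open('cache.bin', 'wb')
    cPickle.dump((set_a, set_b), fh)
    fh.close()
else:
    fh = open('cache.bin', 'rb')
    set_a, set_b = cPickle.load(fh)
    fh.close()
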
    ])
__email__ = 'dvanliere at gmail dot com'
__date__ = '2011-04-20'
__version__ = '0.1'

import sys
import os
from datetime import datetime

if '..' not in sys.path:
    sys.path.append('..%s..%s' % (os.sep, os.sep))

from classes import storage
from classes import settings

rts = settings.Settings()
db = storage.init_database('mongo', 'wikilytics', 'enwiki_editors_dataset')
location = os.path.join(rts.csv_location, 'd_20110502.tsv')
fh = open(location, 'r')
for i, line in enumerate(fh):
    if i == 0:
        continue
    line = line.strip()
    line = line.replace("'", '')
    line = line.split('\t')
    id = line[0]
    id = id[:-1]
    if line[1] == 'None':
        continue
    date = datetime.strptime(line[1][:8], '%Y%m%d')
    if i % 1000 == 0:

def __init__(self, rts, tasks, result):
    self.db_raw = storage.init_database(rts.storage, rts.dbname,
                                        rts.editors_raw)
    self.db_dataset = storage.init_database(rts.storage, rts.dbname,
                                            rts.editors_dataset)