def launcher(rts): ''' This function initializes the multiprocessor, and loading the queue with the compressed XML files. ''' input_queue = JoinableQueue() files = file_utils.retrieve_file_list(rts.input_location) if len(files) > cpu_count(): processors = cpu_count() - 1 else: processors = len(files) fhd = buffer.FileHandleDistributor(rts.max_filehandles, processors) for filename in files: filename = os.path.join(rts.input_location, filename) print filename input_queue.put(filename) for x in xrange(processors): print 'Inserting poison pill %s...' % x input_queue.put(None) extracters = [ Process(target=stream_raw_xml, args=[input_queue, process_id, fhd, rts]) for process_id in xrange(processors) ] for extracter in extracters: extracter.start() input_queue.join()
def launcher(rts):
    '''
    rts is an instance of RunTimeSettings
    '''
    csv_files = file_utils.retrieve_file_list(rts.txt, 'csv')
    pbar = progressbar.ProgressBar(maxval=len(csv_files)).start()

    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()

    number_of_processes = 2
    sorters = [Sorter(rts, tasks, result)
               for _ in xrange(number_of_processes)]

    # Queue the work, then one poison pill (None) per sorter.
    for csv_file in csv_files:
        tasks.put(csv_file)
    for _ in xrange(number_of_processes):
        tasks.put(None)

    for sorter in sorters:
        sorter.start()

    # Workers report True per finished file and a non-True value when
    # they swallow their poison pill; loop until every pill is back.
    pending_pills = number_of_processes
    while pending_pills > 0:
        try:
            outcome = result.get()
            if outcome == True:
                pbar.update(pbar.currval + 1)
            else:
                pending_pills -= 1
        except Empty:
            pass

    tasks.join()
def launcher_simple(location='c:\\wikimedia\\nl\\wiki\\',
                    output_location='c:\\wikimedia\\nl\\wiki\\diffs\\',
                    format='json'):
    '''
    Single-process debugging variant of the launcher: parses every dump
    file in `location` sequentially and writes the diffs to
    `output_location`.

    The input/output paths and output format used to be hard-coded
    locals; they are now keyword parameters whose defaults preserve the
    original behavior, so existing zero-argument calls still work while
    other datasets can be processed without editing the function.

    :param location: directory containing the dump files to parse.
    :param output_location: directory the parsed diffs are written to.
    :param format: output format passed through to parse_xml.
    '''
    files = file_utils.retrieve_file_list(location)
    # Single process, so the worker id is fixed at 0.
    process_id = 0
    for filename in files:
        fh = file_utils.create_streaming_buffer(
            os.path.join(location, filename))
        parse_xml(fh, format, process_id, output_location)
        fh.close()
def launcher(rts): ''' This is the main entry point and creates a number of workers and launches them. ''' print 'Input directory is: %s ' % rts.sorted db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw) db.drop_collection() files = file_utils.retrieve_file_list(rts.sorted, 'csv') pbar = progressbar.ProgressBar(maxval=len(files)).start() tasks = multiprocessing.JoinableQueue() result = multiprocessing.JoinableQueue() storers = [ Storer(rts, tasks, result) for x in xrange(rts.number_of_processes) ] for filename in files: tasks.put(filename) for x in xrange(rts.number_of_processes): tasks.put(None) for storer in storers: storer.start() ppills = rts.number_of_processes while ppills > 0: try: res = result.get(block=False) if res == True: pbar.update(pbar.currval + 1) else: ppills -= 1 except Empty: pass tasks.join() print '\nCreating indexes...' db.add_index('user_id') db.add_index('username') db.add_index('article_id') db.add_index('reverted_by') db.add_index('revert') db.add_index('bot') db.add_index('date') db.add_index('ns') db.add_index('delta')
def launcher_articles(rts): ''' This function reads articles.csv and stores it in a separate collection. Besides containing the title of an article, it also includes: * namespace * category (if any) * article id * redirect (true / false) * timestamp article created ''' db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw) db.drop_collection() files = file_utils.retrieve_file_list(rts.txt, extension='csv', mask='articles') tasks = multiprocessing.JoinableQueue() print 'Storing articles...' for filename in files: tasks.put(filename) for x in xrange(rts.number_of_processes): tasks.put(None) storers = [ multiprocessing.Process(target=store_articles, args=[tasks, rts]) for x in xrange(rts.number_of_processes) ] for storer in storers: storer.start() tasks.join() print '\nCreating indexes...' db.add_index('id') db.add_index('title') db.add_index('ns') db.add_index('category')
def launcher(rts):
    '''
    Store the already-extracted JSON diffs in the database and create
    the indexes the diffs dataset needs.

    NOTE(review): the original body also built a JoinableQueue, printed
    and enqueued every input file plus one poison pill per processor —
    but the worker processes that would consume the queue were entirely
    commented out. Filling a multiprocessing queue that is never drained
    is pure waste and its feeder thread can block interpreter shutdown,
    so that dead extraction stage (and the commented-out code) has been
    removed. Only store_json_diffs() and the index creation were live.
    '''
    store_json_diffs(rts)

    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)
    db.add_index('title')
    db.add_index('timestamp')
    db.add_index('username')
    db.add_index('ns')
    db.add_index('editor')
def bot_launcher(language_code, project, target, action, single=False, manager=False):
    '''
    This function sets the stage to launch bot id detection and collecting
    data to discover new bots.

    :param language_code: language part of the dump location (e.g. 'en').
    :param project: project part of the dump location (e.g. 'wiki').
    :param target: passed through to the TXTFile / XMLFile consumers.
    :param action: 'lookup' reads txt files against the known Bots.csv;
        any other value builds a prediction set from the xml chunks.
    :param single: when True, process the queue in this process instead
        of fanning out via bot_launcher_multi.
    :param manager: when truthy, replaced below by a multiprocessing
        Manager instance — presumably to share `bots` between processes;
        TODO confirm against read_bots_csv_file.
    '''
    # Remove any stale output from a previous run before starting.
    file_utils.delete_file(settings.csv_location, 'bots_ids.csv')
    location = os.path.join(settings.input_location, language_code, project)
    input_xml = os.path.join(location, 'chunks')
    input_txt = os.path.join(location, 'txt')

    tasks = multiprocessing.JoinableQueue()
    mgr = multiprocessing.Manager()
    # Field names the consumers extract for each bot record.
    keys = ['id', 'name', 'verified', 'projects']

    if action == 'lookup':
        # Look up ids for bots already known from Bots.csv, using the
        # plain-text dumps.
        output_file = 'bots_ids.csv'
        files = file_utils.retrieve_file_list(input_txt, 'txt', mask=None)
        # NOTE(review): input_queue is built but never read below —
        # looks like leftover code; verify before relying on it.
        input_queue = pc.load_queue(files, poison_pill=True)
        bots = read_bots_csv_file(settings.csv_location, 'Bots.csv', 'utf-8', manager=manager)
        for file in files:
            tasks.put(consumers.TXTFile(file, input_txt, settings.csv_location, output_file, target, bots=bots, keys=keys))
    else:
        # Build a prediction set for discovering new bots from the raw
        # xml chunks; starts with no known bots.
        output_file = 'bots_predictionset.csv'
        files = file_utils.retrieve_file_list(input_xml, 'xml', mask=None)
        bots = {}
        for file in files:
            tasks.put(consumers.XMLFile(file, input_xml, settings.csv_location, output_file, target, bots=bots, keys=keys))

    #lock = mgr.Lock()
    # Rebind the boolean flag to an actual Manager instance when shared
    # state is requested.
    if manager:
        manager = mgr

    # NOTE(review): tracker is never used after this point in the
    # visible code.
    tracker = {}
    if single:
        # Drain the queue synchronously in this process; Empty signals
        # the queue has been exhausted.
        while True:
            try:
                print '%s files left in the queue...' % messages.show(tasks.qsize)
                task = tasks.get(block=False)
                bots = task(bots)
            except Empty:
                break
    else:
        # Fan the queue out to worker processes.
        bot_launcher_multi(tasks)

    # Persist whatever bot data was accumulated for later runs.
    file_utils.store_object(bots, settings.binary_location, 'bots.bin')
    if action == 'lookup':
        store_bots()
        # Any entries still left in `bots` could not be resolved to a
        # user id; report them (best effort — encoding errors are
        # deliberately ignored).
        if bots != {}:
            print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(bots)
            keys = bots.keys()
            for key in keys:
                try:
                    print '%s' % key.encode('utf-8')
                except:
                    pass
    else:
        # Prediction-set path: feed the collected data into training.
        bot_training_dataset(bots)