Example #1
0
def launcher(rts):
    '''
    This function initializes the multiprocessor, and loading the queue with
    the compressed XML files. 
    '''
    input_queue = JoinableQueue()

    files = file_utils.retrieve_file_list(rts.input_location)

    if len(files) > cpu_count():
        processors = cpu_count() - 1
    else:
        processors = len(files)

    fhd = buffer.FileHandleDistributor(rts.max_filehandles, processors)

    for filename in files:
        filename = os.path.join(rts.input_location, filename)
        print filename
        input_queue.put(filename)

    for x in xrange(processors):
        print 'Inserting poison pill %s...' % x
        input_queue.put(None)

    extracters = [
        Process(target=stream_raw_xml,
                args=[input_queue, process_id, fhd, rts])
        for process_id in xrange(processors)
    ]
    for extracter in extracters:
        extracter.start()

    input_queue.join()
Example #2
0
def launcher(rts):
    '''
    Sort the csv files found in rts.txt using a small pool of Sorter
    processes, showing a progress bar while they work.

    :param rts: instance of RunTimeSettings; provides the txt input
        directory.
    '''
    files = file_utils.retrieve_file_list(rts.txt, 'csv')
    pbar = progressbar.ProgressBar(maxval=len(files)).start()
    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()
    number_of_processes = 2
    sorters = [Sorter(rts, tasks, result) for x in xrange(number_of_processes)]

    for filename in files:
        tasks.put(filename)

    # One poison pill (None) per worker so every Sorter terminates cleanly.
    for x in xrange(number_of_processes):
        tasks.put(None)

    for sorter in sorters:
        sorter.start()

    # Workers report True for each processed file and a non-True value when
    # they swallow their poison pill; count pills to know when all are done.
    # The original wrapped this in try/except Empty, but a blocking get()
    # never raises Empty, so the handler was dead code and has been removed.
    ppills = number_of_processes
    while ppills > 0:
        res = result.get()
        if res == True:
            pbar.update(pbar.currval + 1)
        else:
            ppills -= 1

    tasks.join()
Example #3
0
def launcher_simple(location='c:\\wikimedia\\nl\\wiki\\',
                    output_location='c:\\wikimedia\\nl\\wiki\\diffs\\',
                    format='json'):
    '''
    Sequentially parse every dump file in a directory, writing diffs to
    output_location. Single-process variant of the multiprocessing launchers.

    :param location: directory containing the dump files (defaults to the
        original hard-coded path for backward compatibility).
    :param output_location: directory where parse_xml writes its output.
    :param format: output format passed through to parse_xml.
    '''
    files = file_utils.retrieve_file_list(location)
    process_id = 0
    for filename in files:
        # create_streaming_buffer returns an open stream over the
        # (possibly compressed) file; always close it after parsing.
        fh = file_utils.create_streaming_buffer(
            os.path.join(location, filename))
        parse_xml(fh, format, process_id, output_location)
        fh.close()
Example #4
0
def launcher(rts):
    '''
    This is the main entry point and creates a number of workers and launches
    them. 
    '''
    print 'Input directory is: %s ' % rts.sorted
    db = storage.init_database(rts.storage, rts.dbname, rts.editors_raw)
    db.drop_collection()

    files = file_utils.retrieve_file_list(rts.sorted, 'csv')
    pbar = progressbar.ProgressBar(maxval=len(files)).start()

    tasks = multiprocessing.JoinableQueue()
    result = multiprocessing.JoinableQueue()

    storers = [
        Storer(rts, tasks, result) for x in xrange(rts.number_of_processes)
    ]

    for filename in files:
        tasks.put(filename)

    for x in xrange(rts.number_of_processes):
        tasks.put(None)

    for storer in storers:
        storer.start()

    ppills = rts.number_of_processes
    while ppills > 0:
        try:
            res = result.get(block=False)
            if res == True:
                pbar.update(pbar.currval + 1)
            else:
                ppills -= 1
        except Empty:
            pass

    tasks.join()
    print '\nCreating indexes...'
    db.add_index('user_id')
    db.add_index('username')
    db.add_index('article_id')
    db.add_index('reverted_by')
    db.add_index('revert')
    db.add_index('bot')
    db.add_index('date')
    db.add_index('ns')
    db.add_index('delta')
Example #5
0
def launcher_articles(rts):
    '''
    This function reads articles.csv and stores it in a separate collection.
    Besides containing the title of an article, it also includes:
    * namespace
    * category (if any)
    * article id
    * redirect (true / false)
    * timestamp article created
    '''
    db = storage.init_database(rts.storage, rts.dbname, rts.articles_raw)
    db.drop_collection()

    files = file_utils.retrieve_file_list(rts.txt,
                                          extension='csv',
                                          mask='articles')
    tasks = multiprocessing.JoinableQueue()

    print 'Storing articles...'

    for filename in files:
        tasks.put(filename)

    for x in xrange(rts.number_of_processes):
        tasks.put(None)

    storers = [
        multiprocessing.Process(target=store_articles, args=[tasks, rts])
        for x in xrange(rts.number_of_processes)
    ]

    for storer in storers:
        storer.start()

    tasks.join()

    print '\nCreating indexes...'
    db.add_index('id')
    db.add_index('title')
    db.add_index('ns')
    db.add_index('category')
Example #6
0
def launcher(rts):
    '''
    This function initializes the multiprocessor, and loading the queue with
    the compressed XML files. 
    '''
    input_queue = JoinableQueue()
    format = 'json'
    files = file_utils.retrieve_file_list(rts.input_location)

    if len(files) > cpu_count():
        processors = cpu_count() - 1
    else:
        processors = len(files)

    for filename in files:
        filename = os.path.join(rts.input_location, filename)
        print filename
        input_queue.put(filename)

    for x in xrange(processors):
        print 'Inserting poison pill %s...' % x
        input_queue.put(None)


#    extracters = [Process(target=stream_raw_xml, args=[input_queue, process_id,
#                                                       rts, format])
#                  for process_id in xrange(processors)]
#    for extracter in extracters:
#        extracter.start()
#
#    input_queue.join()

    store_json_diffs(rts)
    db = storage.init_database(rts.storage, rts.dbname, rts.diffs_dataset)

    db.add_index('title')
    db.add_index('timestamp')
    db.add_index('username')
    db.add_index('ns')
    db.add_index('editor')
Example #7
0
def bot_launcher(language_code,
                 project,
                 target,
                 action,
                 single=False,
                 manager=False):
    '''
    This function sets the stage to launch bot id detection and collecting data
    to discover new bots.
    '''
    file_utils.delete_file(settings.csv_location, 'bots_ids.csv')
    location = os.path.join(settings.input_location, language_code, project)
    input_xml = os.path.join(location, 'chunks')
    input_txt = os.path.join(location, 'txt')

    tasks = multiprocessing.JoinableQueue()
    mgr = multiprocessing.Manager()
    keys = ['id', 'name', 'verified', 'projects']

    if action == 'lookup':
        output_file = 'bots_ids.csv'
        files = file_utils.retrieve_file_list(input_txt, 'txt', mask=None)
        input_queue = pc.load_queue(files, poison_pill=True)
        bots = read_bots_csv_file(settings.csv_location,
                                  'Bots.csv',
                                  'utf-8',
                                  manager=manager)
        for file in files:
            tasks.put(
                consumers.TXTFile(file,
                                  input_txt,
                                  settings.csv_location,
                                  output_file,
                                  target,
                                  bots=bots,
                                  keys=keys))

    else:
        output_file = 'bots_predictionset.csv'
        files = file_utils.retrieve_file_list(input_xml, 'xml', mask=None)
        bots = {}
        for file in files:
            tasks.put(
                consumers.XMLFile(file,
                                  input_xml,
                                  settings.csv_location,
                                  output_file,
                                  target,
                                  bots=bots,
                                  keys=keys))

    #lock = mgr.Lock()
    if manager:
        manager = mgr

    tracker = {}
    if single:
        while True:
            try:
                print '%s files left in the queue...' % messages.show(
                    tasks.qsize)
                task = tasks.get(block=False)
                bots = task(bots)
            except Empty:
                break
    else:
        bot_launcher_multi(tasks)

    file_utils.store_object(bots, settings.binary_location, 'bots.bin')
    if action == 'lookup':
        store_bots()
        if bots != {}:
            print 'The script was unable to retrieve the user id\s for the following %s bots:\n' % len(
                bots)
            keys = bots.keys()
            for key in keys:
                try:
                    print '%s' % key.encode('utf-8')
                except:
                    pass
    else:
        bot_training_dataset(bots)