Example #1
def init(options):
    global TOK
    TOK = SpacyTokenizer(**options)
    Finalize(TOK, TOK.shutdown, exitpriority=100)
Example #2
    def __init__(self, *args, **kwargs):
        self._dirty = set()
        self._finalize = Finalize(self, self.sync, exitpriority=5)
        Scheduler.__init__(self, *args, **kwargs)
        self.max_interval = 5
Example #3
def init(tokenizer_class, annotators):
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(annotators=annotators)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
Example #4
def init():
    global PROCESS_TOK
    PROCESS_TOK = SimpleTokenizer()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
Example #5
def init():
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = CoreNLPTokenizer()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = DocDB()
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
Example #6
def init(tokenizer_class, tokenizer_opts):
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(**tokenizer_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
Example #7
def init():
    global TOK
    TOK = SpacyTokenizer(annotators=ANNTOTORS)
    Finalize(TOK, TOK.shutdown, exitpriority=100)
Example #8
def main(args):
    # --------------------------------------------------------------------------
    # TOK
    global PROCESS_TOK
    tok_class = tokenizers.get_class("corenlp")
    tok_opts = {}
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    dataset = args.dataset  #'quasart'#'searchqa'#'unftriviaqa'#'squad'#
    filename_train_docs = sys_dir + "/data/datasets/" + dataset + "/train.json"
    filename_dev_docs = sys_dir + "/data/datasets/" + dataset + "/dev.json"
    filename_test_docs = sys_dir + "/data/datasets/" + dataset + "/test.json"
    train_docs, train_questions = utils.load_data_with_doc(
        args, filename_train_docs)
    logger.info(len(train_docs))
    filename_train = sys_dir + "/data/datasets/" + dataset + "/train.txt"
    filename_dev = sys_dir + "/data/datasets/" + dataset + "/dev.txt"
    train_exs_with_doc = read_data(filename_train, train_questions)

    logger.info('Num train examples = %d' % len(train_exs_with_doc))

    dev_docs, dev_questions = utils.load_data_with_doc(args, filename_dev_docs)
    logger.info(len(dev_docs))
    dev_exs_with_doc = read_data(filename_dev, dev_questions)
    logger.info('Num dev examples = %d' % len(dev_exs_with_doc))

    test_docs, test_questions = utils.load_data_with_doc(
        args, filename_test_docs)
    logger.info(len(test_docs))
    test_exs_with_doc = read_data(
        sys_dir + "/data/datasets/" + dataset + "/test.txt", test_questions)
    logger.info('Num test examples = %d' % len(test_exs_with_doc))

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file)
        #model = DocReader.load(checkpoint_file, args)
        start_epoch = 0
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(args, train_exs + dev_exs)
                added = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file)

        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_docs)  #, train_exs, dev_exs)

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Three datasets: train, dev and test. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')

    train_dataset_with_doc = data.ReaderDataset_with_Doc(train_exs_with_doc,
                                                         model,
                                                         train_docs,
                                                         single_answer=True)
    train_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        train_dataset_with_doc)
    train_loader_with_doc = torch.utils.data.DataLoader(
        train_dataset_with_doc,
        batch_size=args.batch_size,
        sampler=train_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    dev_dataset_with_doc = data.ReaderDataset_with_Doc(dev_exs_with_doc,
                                                       model,
                                                       dev_docs,
                                                       single_answer=False)
    dev_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        dev_dataset_with_doc)
    dev_loader_with_doc = torch.utils.data.DataLoader(
        dev_dataset_with_doc,
        batch_size=args.test_batch_size,
        sampler=dev_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    test_dataset_with_doc = data.ReaderDataset_with_Doc(test_exs_with_doc,
                                                        model,
                                                        test_docs,
                                                        single_answer=False)
    test_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        test_dataset_with_doc)
    test_loader_with_doc = torch.utils.data.DataLoader(
        test_dataset_with_doc,
        batch_size=args.test_batch_size,
        sampler=test_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}

    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        if (args.mode == 'all'):
            train(args, train_loader_with_doc, model, stats,
                  train_exs_with_doc, train_docs)
        if (args.mode == 'reader'):
            pretrain_reader(args, train_loader_with_doc, model, stats,
                            train_exs_with_doc, train_docs)
        if (args.mode == 'selector'):
            pretrain_ranker(args, train_loader_with_doc, model, stats,
                            train_exs_with_doc, train_docs)

        result = validate_unofficial_with_doc(args, dev_loader_with_doc, model,
                                              stats, dev_exs_with_doc,
                                              dev_docs, 'dev')
        validate_unofficial_with_doc(args, train_loader_with_doc, model, stats,
                                     train_exs_with_doc, train_docs, 'train')
        if (dataset == 'webquestions' or dataset == 'CuratedTrec'):
            result = validate_unofficial_with_doc(args, test_loader_with_doc,
                                                  model, stats,
                                                  test_exs_with_doc, test_docs,
                                                  'test')
        else:
            validate_unofficial_with_doc(args, test_loader_with_doc, model,
                                         stats, test_exs_with_doc, test_docs,
                                         'test')
        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]
Example #9
    def run(self):
        '''
        Bind the pub and pull sockets for events
        '''
        salt.utils.appendproctitle(self.__class__.__name__)
        # Set up the context
        self.context = zmq.Context(1)
        # Prepare the master event publisher
        self.epub_sock = self.context.socket(zmq.PUB)
        try:
            self.epub_sock.setsockopt(zmq.HWM,
                                      self.opts['event_publisher_pub_hwm'])
        except AttributeError:
            self.epub_sock.setsockopt(zmq.SNDHWM,
                                      self.opts['event_publisher_pub_hwm'])
            self.epub_sock.setsockopt(zmq.RCVHWM,
                                      self.opts['event_publisher_pub_hwm'])
        # Prepare master event pull socket
        self.epull_sock = self.context.socket(zmq.PULL)
        if self.opts['ipc_mode'] == 'tcp':
            epub_uri = 'tcp://127.0.0.1:{0}'.format(
                self.opts['tcp_master_pub_port'])
            epull_uri = 'tcp://127.0.0.1:{0}'.format(
                self.opts['tcp_master_pull_port'])
        else:
            epub_uri = 'ipc://{0}'.format(
                os.path.join(self.opts['sock_dir'], 'master_event_pub.ipc'))
            salt.utils.zeromq.check_ipc_path_max_len(epub_uri)
            epull_uri = 'ipc://{0}'.format(
                os.path.join(self.opts['sock_dir'], 'master_event_pull.ipc'))
            salt.utils.zeromq.check_ipc_path_max_len(epull_uri)

        # Start the master event publisher
        old_umask = os.umask(0o177)
        try:
            self.epull_sock.bind(epull_uri)
            self.epub_sock.bind(epub_uri)
            if self.opts['client_acl'] or self.opts['client_acl_blacklist']:
                salt.utils.warn_until(
                    'Nitrogen',
                    'ACL rules should be configured with \'publisher_acl\' and '
                    '\'publisher_acl_blacklist\' not \'client_acl\' and '
                    '\'client_acl_blacklist\'. This functionality will be removed in Salt '
                    'Nitrogen.')
            if (self.opts['ipc_mode'] != 'tcp'
                    and (self.opts['publisher_acl'] or self.opts['client_acl']
                         or self.opts['external_auth'])):
                os.chmod(
                    os.path.join(self.opts['sock_dir'],
                                 'master_event_pub.ipc'), 0o666)
        finally:
            os.umask(old_umask)

        # Make sure the ZMQ context and respective sockets are closed and
        # destroyed
        Finalize(self, self.destroy_zmq_context, exitpriority=15)

        while True:
            # Catch and handle EINTR from when this process is sent
            # SIGUSR1 gracefully so we don't choke and die horribly
            try:
                package = self.epull_sock.recv()
                self.epub_sock.send(package)
            except zmq.ZMQError as exc:
                if exc.errno == errno.EINTR:
                    continue
                raise exc
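The loop at the end of run() simply forwards every event received on the PULL socket to the PUB socket. Below is a stand-alone sketch of that forwarding pattern in plain pyzmq, with made-up localhost ports and none of the Salt-specific setup:

# Minimal PULL -> PUB forwarder; pyzmq only, ports are arbitrary placeholders.
import zmq

context = zmq.Context(1)

pub = context.socket(zmq.PUB)      # publisher that clients subscribe to
pub.bind("tcp://127.0.0.1:5556")

pull = context.socket(zmq.PULL)    # socket that producers push events into
pull.bind("tcp://127.0.0.1:5557")

try:
    while True:
        package = pull.recv()      # blocks until an event arrives
        pub.send(package)          # re-publish it unchanged
finally:
    pull.close()
    pub.close()
    context.term()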
Example #10
def init(tokenizer_class, options):
    global TOK
    # Finalize is responsible for adding the shutdown callback to multiprocessing's finalizer registry.
    TOK = tokenizer_class(**options)
    Finalize(TOK, TOK.shutdown, exitpriority=100)
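Most snippets in this listing follow the pattern shown here: build a per-process resource inside the pool initializer and register its cleanup method with Finalize. A minimal, self-contained sketch of what that registration does (FakeTokenizer is an invented stand-in for the project-specific tokenizers):

# Sketch only: FakeTokenizer replaces the real tokenizer classes used above.
from multiprocessing.util import Finalize


class FakeTokenizer:
    def shutdown(self):
        print("tokenizer shut down")


TOK = FakeTokenizer()

# Register shutdown() so it is called at interpreter exit; finalizers with a
# higher exitpriority run earlier during exit processing.
finalizer = Finalize(TOK, TOK.shutdown, exitpriority=100)

finalizer()   # a finalizer can also be invoked explicitly; later calls are no-ops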
Example #11
def init(tokenizer_class, db):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db
Example #12
def init(top_k):
    global PROCESS_DB, PROCESS_RANKER, TOP_K
    PROCESS_DB = OldDocbDB(DRQA_DOC_DB)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker(DRQA_RANKER, strict=False, tokenizer='simple')
    TOP_K = top_k
Example #13
    def __init__(self,
                 concurrency=None,
                 logfile=None,
                 loglevel=None,
                 send_events=conf.SEND_EVENTS,
                 hostname=None,
                 ready_callback=noop,
                 embed_clockservice=False,
                 pool_cls=conf.CELERYD_POOL,
                 listener_cls=conf.CELERYD_LISTENER,
                 mediator_cls=conf.CELERYD_MEDIATOR,
                 eta_scheduler_cls=conf.CELERYD_ETA_SCHEDULER,
                 schedule_filename=conf.CELERYBEAT_SCHEDULE_FILENAME,
                 task_time_limit=conf.CELERYD_TASK_TIME_LIMIT,
                 task_soft_time_limit=conf.CELERYD_TASK_SOFT_TIME_LIMIT,
                 max_tasks_per_child=conf.CELERYD_MAX_TASKS_PER_CHILD,
                 pool_putlocks=conf.CELERYD_POOL_PUTLOCKS,
                 db=conf.CELERYD_STATE_DB):

        # Options
        self.loglevel = loglevel or self.loglevel
        self.concurrency = concurrency or self.concurrency
        self.logfile = logfile or self.logfile
        self.logger = setup_logger(loglevel, logfile)
        self.hostname = hostname or socket.gethostname()
        self.embed_clockservice = embed_clockservice
        self.ready_callback = ready_callback
        self.send_events = send_events
        self.task_time_limit = task_time_limit
        self.task_soft_time_limit = task_soft_time_limit
        self.max_tasks_per_child = max_tasks_per_child
        self.pool_putlocks = pool_putlocks
        self.db = db
        self._finalize = Finalize(self, self.stop, exitpriority=1)

        if self.db:
            persistence = state.Persistent(self.db)
            Finalize(persistence, persistence.save, exitpriority=5)

        # Queues
        if conf.DISABLE_RATE_LIMITS:
            self.ready_queue = FastQueue()
        else:
            self.ready_queue = TaskBucket(task_registry=registry.tasks)
        self.eta_schedule = Scheduler(self.ready_queue, logger=self.logger)

        self.logger.debug("Instantiating thread components...")

        # Threads + Pool + Consumer
        self.pool = instantiate(pool_cls,
                                self.concurrency,
                                logger=self.logger,
                                initializer=process_initializer,
                                maxtasksperchild=self.max_tasks_per_child,
                                timeout=self.task_time_limit,
                                soft_timeout=self.task_soft_time_limit,
                                putlocks=self.pool_putlocks)
        self.mediator = instantiate(mediator_cls,
                                    self.ready_queue,
                                    callback=self.process_task,
                                    logger=self.logger)
        self.scheduler = instantiate(eta_scheduler_cls,
                                     self.eta_schedule,
                                     logger=self.logger)

        self.clockservice = None
        if self.embed_clockservice:
            self.clockservice = EmbeddedClockService(
                logger=self.logger, schedule_filename=schedule_filename)

        prefetch_count = self.concurrency * conf.CELERYD_PREFETCH_MULTIPLIER
        self.listener = instantiate(listener_cls,
                                    self.ready_queue,
                                    self.eta_schedule,
                                    logger=self.logger,
                                    hostname=self.hostname,
                                    send_events=self.send_events,
                                    init_callback=self.ready_callback,
                                    initial_prefetch_count=prefetch_count,
                                    pool=self.pool)

        # The order is important here;
        #   the first in the list is the first to start,
        # and they must be stopped in reverse order.
        self.components = filter(None,
                                 (self.pool, self.mediator, self.scheduler,
                                  self.clockservice, self.listener))
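The closing comment is the important part of this example: components start first-to-last and must be stopped last-to-first. A tiny sketch of that ordering with placeholder components (not Celery's real classes):

# Start components in declaration order, stop them in reverse, so nothing is
# torn down while something later in the chain still depends on it.
class Component:
    def __init__(self, name):
        self.name = name

    def start(self):
        print("starting", self.name)

    def stop(self):
        print("stopping", self.name)


components = [Component("pool"), Component("mediator"), Component("listener")]

for component in components:
    component.start()

for component in reversed(components):
    component.stop()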
Example #14
def init_wiki2para(db_class, db_opts):
    global PROCESS_WIKI2PARA_DB
    PROCESS_WIKI2PARA_DB = db_class(**db_opts)
    Finalize(PROCESS_WIKI2PARA_DB,
             PROCESS_WIKI2PARA_DB.close,
             exitpriority=100)
Example #15
def pool_init(tokenizer_opts):
    global TOKENIZER, DB
    TOKENIZER = CoreNlpTokenizer(**tokenizer_opts)
    Finalize(TOKENIZER, TOKENIZER.close, exitpriority=100)
    DB = Db()
    Finalize(DB, DB.close, exitpriority=100)
Example #16
    def __init__(self, msis, processes=None, kwargs=None):
        '''
        Parameters
        ----------
        msis : list
               iterable of model structure interface instances
        processes : int
                    number of processes to spawn; if None, it is set to the
                    number of cores
        kwargs : dict
                 kwargs to be passed to :meth:`model_init`
        '''

        if processes is None:
            try:
                processes = multiprocessing.cpu_count()
            except NotImplementedError:
                processes = 1
        ema_logging.info("nr of processes is " + str(processes))

        # setup queues etc.
        self._setup_queues()
        self._taskqueue = queue.Queue(processes * 2)
        self._cache = {}
        self._state = pool.RUN

        # handling of logging
        self.log_queue = multiprocessing.Queue()
        h = ema_logging.NullHandler()
        logging.getLogger(ema_logging.LOGGER_NAME).addHandler(h)

        log_queue_reader = LogQueueReader(self.log_queue)
        log_queue_reader.start()

        # setup of the actual pool
        self._pool = []
        working_dirs = []

        ema_logging.debug('generating workers')

        worker_root = None
        for i in range(processes):
            # consider adding a progress bar if we need to setup
            # many processes including substantial copying

            ema_logging.debug('generating worker ' + str(i))

            workername = self._get_worker_name(i)

            #setup working directories for parallel_ema
            for msi in msis:
                if msi.working_directory != None:
                    if worker_root == None:
                        wd = msis[0].working_directory
                        abs_wd = os.path.abspath(wd)
                        worker_root = os.path.dirname(abs_wd)

                    wd_name = workername + msi.name
                    working_directory = os.path.join(worker_root, wd_name)

                    working_dirs.append(working_directory)
                    shutil.copytree(
                        msi.working_directory,
                        working_directory,
                    )
                    msi.set_working_directory(working_directory)


#             w = multiprocessing.Process(target=worker,
#                                         args=(self._inqueue,
#                                               self._outqueue,
#                                               msis,
#                                               kwargs)
#                                         )
            w = LoggingProcess(
                self.log_queue,
                level=logging.getLogger(
                    ema_logging.LOGGER_NAME).getEffectiveLevel(),
                target=worker,
                args=(self._inqueue, self._outqueue, msis, kwargs))
            self._pool.append(w)

            w.name = w.name.replace('Process', workername)
            w.daemon = True
            w.start()
            ema_logging.debug(' worker ' + str(i) + ' generated')

        # thread for handling tasks
        self._task_handler = threading.Thread(
            target=CalculatorPool._handle_tasks,
            name='task handler',
            args=(self._taskqueue, self._quick_put, self._outqueue,
                  self._pool))
        self._task_handler.daemon = True
        self._task_handler._state = pool.RUN
        self._task_handler.start()

        # thread for handling results
        self._result_handler = threading.Thread(
            target=CalculatorPool._handle_results,
            name='result handler',
            args=(self._outqueue, self._quick_get, self._cache,
                  self.log_queue))
        self._result_handler.daemon = True
        self._result_handler._state = pool.RUN
        self._result_handler.start()

        # function for cleaning up when finalizing object
        self._terminate = Finalize(self,
                                   self._terminate_pool,
                                   args=(
                                       self._taskqueue,
                                       self._inqueue,
                                       self._outqueue,
                                       self._pool,
                                       self._task_handler,
                                       self._result_handler,
                                       self._cache,
                                       working_dirs,
                                   ),
                                   exitpriority=15)

        ema_logging.info("pool has been set up")
Example #17
    def __init__(self, size):
        block = BufferWrapper._heap.malloc(size)
        self._state = (block, size)
        Finalize(self, BufferWrapper._heap.free, args=(block,))
Example #18
    def __init__(self, size):
        assert 0 <= size < sys.maxint
        block = BufferWrapper._heap.malloc(size)
        self._state = (block, size)
        Finalize(self, BufferWrapper._heap.free, args=(block,))
Example #19
    def __init__(self,
                 processes=None,
                 initializer=None,
                 initargs=(),
                 maxtasksperchild=None,
                 timeout=None,
                 soft_timeout=None):
        self._setup_queues()
        self._taskqueue = Queue.Queue()
        self._cache = {}
        self._state = RUN
        self.timeout = timeout
        self.soft_timeout = soft_timeout
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs

        if self.soft_timeout and SIG_SOFT_TIMEOUT is None:
            raise NotImplementedError(
                "Soft timeouts not supported: "
                "Your platform does not have the SIGUSR1 signal.")

        if processes is None:
            try:
                processes = cpu_count()
            except NotImplementedError:
                processes = 1
        self._processes = processes

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        self._pool = []
        for i in range(processes):
            self._create_worker_process()

        self._worker_handler = self.Supervisor(self)
        self._worker_handler.start()

        self._putlock = LaxBoundedSemaphore(self._processes)
        self._task_handler = self.TaskHandler(self._taskqueue, self._quick_put,
                                              self._outqueue, self._pool)
        self._task_handler.start()

        # Thread killing timedout jobs.
        if self.timeout or self.soft_timeout:
            self._timeout_handler = self.TimeoutHandler(
                self._pool, self._cache, self.soft_timeout, self.timeout,
                self._putlock)
            self._timeout_handler.start()
        else:
            self._timeout_handler = None

        # Thread processing results in the outqueue.
        self._result_handler = self.ResultHandler(self._outqueue,
                                                  self._quick_get, self._cache,
                                                  self._poll_result,
                                                  self._join_exited_workers,
                                                  self._putlock)
        self._result_handler.start()

        self._terminate = Finalize(
            self,
            self._terminate_pool,
            args=(self._taskqueue, self._inqueue, self._outqueue, self._pool,
                  self._worker_handler, self._task_handler,
                  self._result_handler, self._cache, self._timeout_handler),
            exitpriority=15,
        )
Example #20
File: heap.py  Project: 415ec080/finalPrj
    def __init__(self, size):
        assert 0 <= size < sys.maxint
        block = BufferWrapper._heap.malloc(size)
        self._state = (block, size)
        Finalize(self, BufferWrapper._heap.free, args=(block,))
Example #21
    def __init__(self,
                 processes=None,
                 initializer=None,
                 initargs=(),
                 maxtasksperchild=None):
        self._setup_queues()
        self._taskqueue = Queue.Queue()  # queue used to dispatch and store tasks submitted by the user
        self._cache = {}  # shared between the Pool instance and ApplyResult instances; stores tasks and their results
        self._state = RUN  # tracks the state of the main process
        self._maxtasksperchild = maxtasksperchild  # maximum number of tasks each worker process will handle
        self._initializer = initializer  # function executed after each worker process starts
        self._initargs = initargs  # arguments for the _initializer function

        if processes is None:
            try:
                processes = cpu_count()
            except NotImplementedError:
                processes = 1
        if processes < 1:
            raise ValueError("Number of processes must be at least 1")

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        self._processes = processes
        self._pool = []
        """
        w = self.Process()
        self._pool.append(w)
        """
        self._repopulate_pool()

        self._worker_handler = threading.Thread(target=Pool._handle_workers,
                                                args=(self, ))
        self._worker_handler.daemon = True
        self._worker_handler._state = RUN
        self._worker_handler.start()

        self._task_handler = threading.Thread(
            target=Pool._handle_tasks,
            args=(self._taskqueue, self._quick_put, self._outqueue, self._pool,
                  self._cache))
        self._task_handler.daemon = True
        self._task_handler._state = RUN
        self._task_handler.start()

        self._result_handler = threading.Thread(target=Pool._handle_results,
                                                args=(self._outqueue,
                                                      self._quick_get,
                                                      self._cache))
        self._result_handler.daemon = True
        self._result_handler._state = RUN
        self._result_handler.start()

        self._terminate = Finalize(
            self,
            self._terminate_pool,
            args=(self._taskqueue, self._inqueue, self._outqueue, self._pool,
                  self._worker_handler, self._task_handler,
                  self._result_handler, self._cache),
            exitpriority=15)
Example #22
def pool_init(tokenizer_opts):
    global TOKENIZER
    TOKENIZER = CoreNlpTokenizer(**tokenizer_opts)
    # Do not put parentheses after close: we pass the method itself, not the result of calling it.
    Finalize(TOKENIZER, TOKENIZER.close, exitpriority=100)
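The comment above deserves emphasis: Finalize must be given the callable itself, not the result of calling it. A small illustration with an invented Resource class:

# Sketch only: Resource stands in for the tokenizer above.
from multiprocessing.util import Finalize


class Resource:
    def close(self):
        print("resource closed")


resource = Resource()

# Correct: pass the bound method; Finalize stores it and calls it later.
Finalize(resource, resource.close, exitpriority=100)

# Wrong: resource.close() would run immediately and register its return value
# (None), which then fails when the finalizer tries to call it at exit.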
Example #23
    def __init__(self,
                 processes=None,
                 initializer=None,
                 initargs=(),
                 worker_names=None,
                 maxtasksperchild=None,
                 max_queued_tasks=0):
        """
        Overriding this method in order to:
            * Name the pool worker threads
            * Name the threads used for managing the Pool internals
        """
        self.Process = partial(DaemonProcess, name=worker_names)

        self.worker_names = worker_names
        self._setup_queues(max_queued_tasks)
        self._taskqueue = Queue.Queue()
        self._cache = {}
        self._state = RUN
        self._maxtasksperchild = maxtasksperchild
        self._initializer = initializer
        self._initargs = initargs

        if processes is None:
            try:
                processes = cpu_count()
            except NotImplementedError:
                processes = 1
        if processes < 1:
            raise ValueError("Number of processes must be at least 1")

        if initializer is not None and not hasattr(initializer, '__call__'):
            raise TypeError('initializer must be a callable')

        self._processes = processes
        self._pool = []
        self._repopulate_pool()

        self._worker_handler = threading.Thread(target=Pool._handle_workers,
                                                args=(self, ),
                                                name='PoolWorkerHandler')
        self._worker_handler.daemon = True
        self._worker_handler._state = RUN
        self._worker_handler.start()

        self._task_handler = threading.Thread(
            target=Pool._handle_tasks,
            args=(self._taskqueue, self._quick_put, self._outqueue, self._pool,
                  self._cache),
            name='PoolTaskHandler')
        self._task_handler.daemon = True
        self._task_handler._state = RUN
        self._task_handler.start()

        self._result_handler = threading.Thread(target=Pool._handle_results,
                                                args=(self._outqueue,
                                                      self._quick_get,
                                                      self._cache),
                                                name='PoolResultHandler')
        self._result_handler.daemon = True
        self._result_handler._state = RUN
        self._result_handler.start()

        self._terminate = Finalize(
            self,
            self._terminate_pool,
            args=(self._taskqueue, self._inqueue, self._outqueue, self._pool,
                  self._worker_handler, self._task_handler,
                  self._result_handler, self._cache),
            exitpriority=15)
Example #24
    # get the closest docs for each question.
    logger.info('Initializing ranker...')
    ranker = retriever.get_class('tfidf')(tfidf_path=args.tfidf)

    logger.info('Ranking...')
    closest_docs = ranker.batch_closest_docs(questions,
                                             k=args.n_docs,
                                             num_workers=args.num_workers)

    tok_class = tokenizers.get_class(args.tokenizer)
    tok_opts = {}
    db_class = retriever.DocDB
    db_opts = {'db_path': args.doc_db}
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

    del ranker

    logger.info('Initializing Reader...')
    reader = Reader(args.reader_model_type, args.reader_path,
                    args.reader_output_dir)
    reader.load_model()
    tokenizer = reader.get_tokenizer()

    logger.info('Splitting documents into passages...')
    documents = []
    docs_per_queston = []
    for doc_ids, _ in closest_docs:
Example #25
def init(tokenizer_class, options):
    global TOK
    TOK = tokenizer_class(**options)
    Finalize(TOK, TOK.shutdown, exitpriority=100)
Example #26
    def __init__(self, rfs):
        self.rfs = rfs
        self.rfs.enter_chroot()
        self.finalizer = Finalize(self, self.rfs.leave_chroot, exitpriority=10)
Example #27
def init(db_class, db_opts, tok_class, tok_opts):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_DB = db_class(**db_opts)
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
Example #28
import sys

sys_dir = './'
sys.path.append(sys_dir)

import json
import tokenizers
from multiprocessing.util import Finalize
tokenizers.set_default('corenlp_classpath', sys_dir + '/data/corenlp/*')

import string
import unicodedata
import regex as re

tok_class = tokenizers.get_class("corenlp")
tok_opts = {}
PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenizer_text(my_str='', uncased=True):
    '''
    :param my_str: string
    :param uncased: bool
    :return: list[str]
    '''
    text = unicodedata.normalize('NFD', my_str)
    answer = PROCESS_TOK.tokenize(text)

    if uncased:
        answer_word = answer.words(uncased=True)
    else:
        answer_word = answer.words(uncased=False)
Example #29
def init(tokenizer_class, db_class, db_opts):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
Example #30
def init(tokenizer_class, tokenizer_opts, candidates=None):
    global PROCESS_TOK, PROCESS_DB, PROCESS_CANDS
    PROCESS_TOK = tokenizer_class(**tokenizer_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_CANDS = candidates