def init(options):
    global TOK
    TOK = SpacyTokenizer(**options)
    Finalize(TOK, TOK.shutdown, exitpriority=100)

def __init__(self, *args, **kwargs):
    self._dirty = set()
    self._finalize = Finalize(self, self.sync, exitpriority=5)
    Scheduler.__init__(self, *args, **kwargs)
    self.max_interval = 5

def init(tokenizer_class, annotators):
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(annotators=annotators)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

def init():
    global PROCESS_TOK
    PROCESS_TOK = SimpleTokenizer()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

def init():
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = CoreNLPTokenizer()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = DocDB()
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

def init(tokenizer_class, tokenizer_opts):
    global PROCESS_TOK
    PROCESS_TOK = tokenizer_class(**tokenizer_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

def init():
    global TOK
    TOK = SpacyTokenizer(annotators=ANNOTATORS)
    Finalize(TOK, TOK.shutdown, exitpriority=100)

def main(args):
    # --------------------------------------------------------------------------
    # TOK
    global PROCESS_TOK
    tok_class = tokenizers.get_class("corenlp")
    tok_opts = {}
    PROCESS_TOK = tok_class(**tok_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

    # DATA
    logger.info('-' * 100)
    logger.info('Load data files')
    dataset = args.dataset  # e.g. 'quasart', 'searchqa', 'unftriviaqa', 'squad'
    filename_train_docs = sys_dir + "/data/datasets/" + dataset + "/train.json"
    filename_dev_docs = sys_dir + "/data/datasets/" + dataset + "/dev.json"
    filename_test_docs = sys_dir + "/data/datasets/" + dataset + "/test.json"
    train_docs, train_questions = utils.load_data_with_doc(
        args, filename_train_docs)
    logger.info(len(train_docs))
    filename_train = sys_dir + "/data/datasets/" + dataset + "/train.txt"
    filename_dev = sys_dir + "/data/datasets/" + dataset + "/dev.txt"
    train_exs_with_doc = read_data(filename_train, train_questions)
    logger.info('Num train examples = %d' % len(train_exs_with_doc))

    dev_docs, dev_questions = utils.load_data_with_doc(args, filename_dev_docs)
    logger.info(len(dev_docs))
    dev_exs_with_doc = read_data(filename_dev, dev_questions)
    logger.info('Num dev examples = %d' % len(dev_exs_with_doc))

    test_docs, test_questions = utils.load_data_with_doc(
        args, filename_test_docs)
    logger.info(len(test_docs))
    test_exs_with_doc = read_data(
        sys_dir + "/data/datasets/" + dataset + "/test.txt", test_questions)
    logger.info('Num test examples = %d' % len(test_exs_with_doc))

    # --------------------------------------------------------------------------
    # MODEL
    logger.info('-' * 100)
    start_epoch = 0
    if args.checkpoint and os.path.isfile(args.model_file + '.checkpoint'):
        # Just resume training, no modifications.
        logger.info('Found a checkpoint...')
        checkpoint_file = args.model_file + '.checkpoint'
        model, start_epoch = DocReader.load_checkpoint(checkpoint_file)
        start_epoch = 0
    else:
        # Training starts fresh. But the model state is either pretrained or
        # newly (randomly) initialized.
        if args.pretrained:
            logger.info('Using pretrained model...')
            model = DocReader.load(args.pretrained, args)
            if args.expand_dictionary:
                logger.info('Expanding dictionary for new data...')
                # Add words in training + dev examples
                words = utils.load_words(
                    args, train_exs_with_doc + dev_exs_with_doc)
                added = model.expand_dictionary(words)
                # Load pretrained embeddings for added words
                if args.embedding_file:
                    model.load_embeddings(added, args.embedding_file)
        else:
            logger.info('Training model from scratch...')
            model = init_from_scratch(args, train_docs)

        # Set up optimizer
        model.init_optimizer()

    # Use the GPU?
    if args.cuda:
        model.cuda()

    # Use multiple GPUs?
    if args.parallel:
        model.parallelize()

    # --------------------------------------------------------------------------
    # DATA ITERATORS
    # Three datasets: train, dev, and test. If we sort by length it's faster.
    logger.info('-' * 100)
    logger.info('Make data loaders')
    train_dataset_with_doc = data.ReaderDataset_with_Doc(
        train_exs_with_doc, model, train_docs, single_answer=True)
    train_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        train_dataset_with_doc)
    train_loader_with_doc = torch.utils.data.DataLoader(
        train_dataset_with_doc,
        batch_size=args.batch_size,
        sampler=train_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    dev_dataset_with_doc = data.ReaderDataset_with_Doc(
        dev_exs_with_doc, model, dev_docs, single_answer=False)
    dev_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        dev_dataset_with_doc)
    dev_loader_with_doc = torch.utils.data.DataLoader(
        dev_dataset_with_doc,
        batch_size=args.test_batch_size,
        sampler=dev_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    test_dataset_with_doc = data.ReaderDataset_with_Doc(
        test_exs_with_doc, model, test_docs, single_answer=False)
    test_sampler_with_doc = torch.utils.data.sampler.SequentialSampler(
        test_dataset_with_doc)
    test_loader_with_doc = torch.utils.data.DataLoader(
        test_dataset_with_doc,
        batch_size=args.test_batch_size,
        sampler=test_sampler_with_doc,
        num_workers=args.data_workers,
        collate_fn=vector.batchify_with_docs,
        pin_memory=args.cuda,
    )

    # -------------------------------------------------------------------------
    # PRINT CONFIG
    logger.info('-' * 100)
    logger.info('CONFIG:\n%s' %
                json.dumps(vars(args), indent=4, sort_keys=True))

    # --------------------------------------------------------------------------
    # TRAIN/VALID LOOP
    logger.info('-' * 100)
    logger.info('Starting training...')
    stats = {'timer': utils.Timer(), 'epoch': 0, 'best_valid': 0}
    for epoch in range(start_epoch, args.num_epochs):
        stats['epoch'] = epoch

        # Train
        if args.mode == 'all':
            train(args, train_loader_with_doc, model, stats,
                  train_exs_with_doc, train_docs)
        if args.mode == 'reader':
            pretrain_reader(args, train_loader_with_doc, model, stats,
                            train_exs_with_doc, train_docs)
        if args.mode == 'selector':
            pretrain_ranker(args, train_loader_with_doc, model, stats,
                            train_exs_with_doc, train_docs)

        result = validate_unofficial_with_doc(args, dev_loader_with_doc,
                                              model, stats, dev_exs_with_doc,
                                              dev_docs, 'dev')
        validate_unofficial_with_doc(args, train_loader_with_doc, model,
                                     stats, train_exs_with_doc, train_docs,
                                     'train')
        if dataset == 'webquestions' or dataset == 'CuratedTrec':
            result = validate_unofficial_with_doc(args, test_loader_with_doc,
                                                  model, stats,
                                                  test_exs_with_doc,
                                                  test_docs, 'test')
        else:
            validate_unofficial_with_doc(args, test_loader_with_doc, model,
                                         stats, test_exs_with_doc, test_docs,
                                         'test')

        if result[args.valid_metric] > stats['best_valid']:
            logger.info('Best valid: %s = %.2f (epoch %d, %d updates)' %
                        (args.valid_metric, result[args.valid_metric],
                         stats['epoch'], model.updates))
            model.save(args.model_file)
            stats['best_valid'] = result[args.valid_metric]

def run(self):
    '''
    Bind the pub and pull sockets for events
    '''
    salt.utils.appendproctitle(self.__class__.__name__)
    # Set up the context
    self.context = zmq.Context(1)
    # Prepare the master event publisher
    self.epub_sock = self.context.socket(zmq.PUB)
    try:
        self.epub_sock.setsockopt(zmq.HWM,
                                  self.opts['event_publisher_pub_hwm'])
    except AttributeError:
        self.epub_sock.setsockopt(zmq.SNDHWM,
                                  self.opts['event_publisher_pub_hwm'])
        self.epub_sock.setsockopt(zmq.RCVHWM,
                                  self.opts['event_publisher_pub_hwm'])
    # Prepare master event pull socket
    self.epull_sock = self.context.socket(zmq.PULL)
    if self.opts['ipc_mode'] == 'tcp':
        epub_uri = 'tcp://127.0.0.1:{0}'.format(
            self.opts['tcp_master_pub_port'])
        epull_uri = 'tcp://127.0.0.1:{0}'.format(
            self.opts['tcp_master_pull_port'])
    else:
        epub_uri = 'ipc://{0}'.format(
            os.path.join(self.opts['sock_dir'], 'master_event_pub.ipc'))
        salt.utils.zeromq.check_ipc_path_max_len(epub_uri)
        epull_uri = 'ipc://{0}'.format(
            os.path.join(self.opts['sock_dir'], 'master_event_pull.ipc'))
        salt.utils.zeromq.check_ipc_path_max_len(epull_uri)

    # Start the master event publisher
    old_umask = os.umask(0o177)
    try:
        self.epull_sock.bind(epull_uri)
        self.epub_sock.bind(epub_uri)
        if self.opts['client_acl'] or self.opts['client_acl_blacklist']:
            salt.utils.warn_until(
                'Nitrogen',
                'ACL rules should be configured with \'publisher_acl\' and '
                '\'publisher_acl_blacklist\' not \'client_acl\' and '
                '\'client_acl_blacklist\'. This functionality will be '
                'removed in Salt Nitrogen.')
        if (self.opts['ipc_mode'] != 'tcp' and
                (self.opts['publisher_acl'] or
                 self.opts['client_acl'] or
                 self.opts['external_auth'])):
            os.chmod(
                os.path.join(self.opts['sock_dir'], 'master_event_pub.ipc'),
                0o666)
    finally:
        os.umask(old_umask)
    # Make sure the ZMQ context and respective sockets are closed and
    # destroyed
    Finalize(self, self.destroy_zmq_context, exitpriority=15)
    while True:
        # Catch and handle EINTR from when this process is sent
        # SIGUSR1 gracefully so we don't choke and die horribly
        try:
            package = self.epull_sock.recv()
            self.epub_sock.send(package)
        except zmq.ZMQError as exc:
            if exc.errno == errno.EINTR:
                continue
            raise exc

def init(tokenizer_class, options):
    global TOK
    # Finalize registers a shutdown callback in multiprocessing's exit
    # registry, to be run when this process terminates.
    TOK = tokenizer_class(**options)
    Finalize(TOK, TOK.shutdown, exitpriority=100)

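# The init() snippets above all follow one pattern: a module-level function
# passed to multiprocessing.Pool as its initializer, which builds one
# per-process resource and registers its cleanup with Finalize. A minimal,
# runnable sketch of that pattern; FakeTokenizer is a hypothetical stand-in
# for SpacyTokenizer/CoreNLPTokenizer and not part of any snippet above.
from multiprocessing import Pool
from multiprocessing.util import Finalize

TOK = None  # one instance per worker process


class FakeTokenizer:
    """Hypothetical stand-in for a real tokenizer with a shutdown() method."""

    def __init__(self, **options):
        self.options = options

    def tokenize(self, text):
        return text.split()

    def shutdown(self):
        pass  # a real tokenizer would release a subprocess or model here


def init(tokenizer_class, options):
    global TOK
    TOK = tokenizer_class(**options)
    # Runs TOK.shutdown() when the worker exits; callbacks with higher
    # exitpriority are invoked earlier during shutdown.
    Finalize(TOK, TOK.shutdown, exitpriority=100)


def tokenize(text):
    return TOK.tokenize(text)


if __name__ == '__main__':
    with Pool(2, initializer=init, initargs=(FakeTokenizer, {})) as pool:
        print(pool.map(tokenize, ['hello world', 'finalize at exit']))
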
def init(tokenizer_class, db):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db

def init(top_k):
    global PROCESS_DB, PROCESS_RANKER, TOP_K
    PROCESS_DB = OldDocbDB(DRQA_DOC_DB)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_RANKER = TfidfDocRanker(DRQA_RANKER, strict=False,
                                    tokenizer='simple')
    TOP_K = top_k

def __init__(self, concurrency=None, logfile=None, loglevel=None,
             send_events=conf.SEND_EVENTS, hostname=None,
             ready_callback=noop, embed_clockservice=False,
             pool_cls=conf.CELERYD_POOL,
             listener_cls=conf.CELERYD_LISTENER,
             mediator_cls=conf.CELERYD_MEDIATOR,
             eta_scheduler_cls=conf.CELERYD_ETA_SCHEDULER,
             schedule_filename=conf.CELERYBEAT_SCHEDULE_FILENAME,
             task_time_limit=conf.CELERYD_TASK_TIME_LIMIT,
             task_soft_time_limit=conf.CELERYD_TASK_SOFT_TIME_LIMIT,
             max_tasks_per_child=conf.CELERYD_MAX_TASKS_PER_CHILD,
             pool_putlocks=conf.CELERYD_POOL_PUTLOCKS,
             db=conf.CELERYD_STATE_DB):

    # Options
    self.loglevel = loglevel or self.loglevel
    self.concurrency = concurrency or self.concurrency
    self.logfile = logfile or self.logfile
    self.logger = setup_logger(loglevel, logfile)
    self.hostname = hostname or socket.gethostname()
    self.embed_clockservice = embed_clockservice
    self.ready_callback = ready_callback
    self.send_events = send_events
    self.task_time_limit = task_time_limit
    self.task_soft_time_limit = task_soft_time_limit
    self.max_tasks_per_child = max_tasks_per_child
    self.pool_putlocks = pool_putlocks
    self.db = db
    self._finalize = Finalize(self, self.stop, exitpriority=1)

    if self.db:
        persistence = state.Persistent(self.db)
        Finalize(persistence, persistence.save, exitpriority=5)

    # Queues
    if conf.DISABLE_RATE_LIMITS:
        self.ready_queue = FastQueue()
    else:
        self.ready_queue = TaskBucket(task_registry=registry.tasks)
    self.eta_schedule = Scheduler(self.ready_queue, logger=self.logger)

    self.logger.debug("Instantiating thread components...")

    # Threads + Pool + Consumer
    self.pool = instantiate(pool_cls, self.concurrency,
                            logger=self.logger,
                            initializer=process_initializer,
                            maxtasksperchild=self.max_tasks_per_child,
                            timeout=self.task_time_limit,
                            soft_timeout=self.task_soft_time_limit,
                            putlocks=self.pool_putlocks)
    self.mediator = instantiate(mediator_cls, self.ready_queue,
                                callback=self.process_task,
                                logger=self.logger)
    self.scheduler = instantiate(eta_scheduler_cls, self.eta_schedule,
                                 logger=self.logger)

    self.clockservice = None
    if self.embed_clockservice:
        self.clockservice = EmbeddedClockService(
            logger=self.logger, schedule_filename=schedule_filename)

    prefetch_count = self.concurrency * conf.CELERYD_PREFETCH_MULTIPLIER
    self.listener = instantiate(listener_cls, self.ready_queue,
                                self.eta_schedule,
                                logger=self.logger,
                                hostname=self.hostname,
                                send_events=self.send_events,
                                init_callback=self.ready_callback,
                                initial_prefetch_count=prefetch_count,
                                pool=self.pool)

    # The order is important here;
    # the first in the list is the first to start,
    # and they must be stopped in reverse order.
    self.components = filter(None, (self.pool,
                                    self.mediator,
                                    self.scheduler,
                                    self.clockservice,
                                    self.listener))

def init_wiki2para(db_class, db_opts):
    global PROCESS_WIKI2PARA_DB
    PROCESS_WIKI2PARA_DB = db_class(**db_opts)
    Finalize(PROCESS_WIKI2PARA_DB, PROCESS_WIKI2PARA_DB.close,
             exitpriority=100)

def pool_init(tokenizer_opts):
    global TOKENIZER, DB
    TOKENIZER = CoreNlpTokenizer(**tokenizer_opts)
    Finalize(TOKENIZER, TOKENIZER.close, exitpriority=100)
    DB = Db()
    Finalize(DB, DB.close, exitpriority=100)

def __init__(self, msis, processes=None, kwargs=None):
    '''
    Parameters
    ----------
    msis : list
           iterable of model structure interface instances
    processes : int
                number of processes to spawn; if None, it is set to the
                number of cores
    kwargs : dict
             kwargs to be passed to :meth:`model_init`
    '''
    if processes is None:
        try:
            processes = multiprocessing.cpu_count()
        except NotImplementedError:
            processes = 1
    ema_logging.info("nr of processes is " + str(processes))

    # setup queues etc.
    self._setup_queues()
    self._taskqueue = queue.Queue(processes * 2)
    self._cache = {}
    self._state = pool.RUN

    # handling of logging
    self.log_queue = multiprocessing.Queue()
    h = ema_logging.NullHandler()
    logging.getLogger(ema_logging.LOGGER_NAME).addHandler(h)

    log_queue_reader = LogQueueReader(self.log_queue)
    log_queue_reader.start()

    # setup of the actual pool
    self._pool = []
    working_dirs = []
    ema_logging.debug('generating workers')

    worker_root = None
    for i in range(processes):
        # consider adding a progress bar if we need to setup
        # many processes including substantial copying
        ema_logging.debug('generating worker ' + str(i))
        workername = self._get_worker_name(i)

        # setup working directories for parallel_ema
        for msi in msis:
            if msi.working_directory is not None:
                if worker_root is None:
                    wd = msis[0].working_directory
                    abs_wd = os.path.abspath(wd)
                    worker_root = os.path.dirname(abs_wd)
                wd_name = workername + msi.name
                working_directory = os.path.join(worker_root, wd_name)
                working_dirs.append(working_directory)
                shutil.copytree(msi.working_directory, working_directory)
                msi.set_working_directory(working_directory)

        w = LoggingProcess(
            self.log_queue,
            level=logging.getLogger(
                ema_logging.LOGGER_NAME).getEffectiveLevel(),
            target=worker,
            args=(self._inqueue, self._outqueue, msis, kwargs))
        self._pool.append(w)

        w.name = w.name.replace('Process', workername)
        w.daemon = True
        w.start()
        ema_logging.debug(' worker ' + str(i) + ' generated')

    # thread for handling tasks
    self._task_handler = threading.Thread(
        target=CalculatorPool._handle_tasks,
        name='task handler',
        args=(self._taskqueue, self._quick_put, self._outqueue, self._pool))
    self._task_handler.daemon = True
    self._task_handler._state = pool.RUN
    self._task_handler.start()

    # thread for handling results
    self._result_handler = threading.Thread(
        target=CalculatorPool._handle_results,
        name='result handler',
        args=(self._outqueue, self._quick_get, self._cache, self.log_queue))
    self._result_handler.daemon = True
    self._result_handler._state = pool.RUN
    self._result_handler.start()

    # function for cleaning up when finalizing object
    self._terminate = Finalize(self,
                               self._terminate_pool,
                               args=(self._taskqueue,
                                     self._inqueue,
                                     self._outqueue,
                                     self._pool,
                                     self._task_handler,
                                     self._result_handler,
                                     self._cache,
                                     working_dirs),
                               exitpriority=15)

    ema_logging.info("pool has been set up")

def __init__(self, size):
    block = BufferWrapper._heap.malloc(size)
    self._state = (block, size)
    Finalize(self, BufferWrapper._heap.free, args=(block,))

def __init__(self, size):
    assert 0 <= size < sys.maxint
    block = BufferWrapper._heap.malloc(size)
    self._state = (block, size)
    Finalize(self, BufferWrapper._heap.free, args=(block,))

def __init__(self, processes=None, initializer=None, initargs=(),
             maxtasksperchild=None, timeout=None, soft_timeout=None):
    self._setup_queues()
    self._taskqueue = Queue.Queue()
    self._cache = {}
    self._state = RUN
    self.timeout = timeout
    self.soft_timeout = soft_timeout
    self._maxtasksperchild = maxtasksperchild
    self._initializer = initializer
    self._initargs = initargs

    if self.soft_timeout and SIG_SOFT_TIMEOUT is None:
        raise NotImplementedError(
            "Soft timeouts not supported: "
            "Your platform does not have the SIGUSR1 signal.")

    if processes is None:
        try:
            processes = cpu_count()
        except NotImplementedError:
            processes = 1
    self._processes = processes

    if initializer is not None and not hasattr(initializer, '__call__'):
        raise TypeError('initializer must be a callable')

    self._pool = []
    for i in range(processes):
        self._create_worker_process()

    self._worker_handler = self.Supervisor(self)
    self._worker_handler.start()

    self._putlock = LaxBoundedSemaphore(self._processes)

    self._task_handler = self.TaskHandler(self._taskqueue, self._quick_put,
                                          self._outqueue, self._pool)
    self._task_handler.start()

    # Thread killing timedout jobs.
    if self.timeout or self.soft_timeout:
        self._timeout_handler = self.TimeoutHandler(
            self._pool, self._cache,
            self.soft_timeout, self.timeout,
            self._putlock)
        self._timeout_handler.start()
    else:
        self._timeout_handler = None

    # Thread processing results in the outqueue.
    self._result_handler = self.ResultHandler(self._outqueue,
                                              self._quick_get, self._cache,
                                              self._poll_result,
                                              self._join_exited_workers,
                                              self._putlock)
    self._result_handler.start()

    self._terminate = Finalize(
        self, self._terminate_pool,
        args=(self._taskqueue, self._inqueue, self._outqueue,
              self._pool, self._worker_handler, self._task_handler,
              self._result_handler, self._cache,
              self._timeout_handler),
        exitpriority=15,
    )

def __init__(self, size):
    assert 0 <= size < sys.maxint
    block = BufferWrapper._heap.malloc(size)
    self._state = (block, size)
    Finalize(self, BufferWrapper._heap.free, args=(block,))

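# When the cleanup callback is not a method of the tracked object itself,
# Finalize forwards `args` to it at call time, as in the BufferWrapper
# variants above. A small, self-contained sketch of that calling convention;
# the Heap class here is a hypothetical stand-in for multiprocessing's heap.
import sys
from multiprocessing.util import Finalize


class Heap:
    def malloc(self, size):
        return bytearray(size)  # stand-in for a shared-memory block

    def free(self, block):
        print('freed %d bytes' % len(block))


class BufferWrapper:
    _heap = Heap()

    def __init__(self, size):
        assert 0 <= size < sys.maxsize
        block = BufferWrapper._heap.malloc(size)
        self._state = (block, size)
        # free(block) runs when self is garbage-collected, or at interpreter
        # exit if the object is still alive then.
        Finalize(self, BufferWrapper._heap.free, args=(block,))


buf = BufferWrapper(16)
del buf  # the finalizer fires here and prints "freed 16 bytes"
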
def __init__(self, processes=None, initializer=None, initargs=(),
             maxtasksperchild=None):
    self._setup_queues()
    self._taskqueue = Queue.Queue()  # distributes the tasks submitted by the user
    self._cache = {}  # shared between the Pool and ApplyResult instances; holds tasks and their results
    self._state = RUN  # state flag for the main process
    self._maxtasksperchild = maxtasksperchild  # max number of tasks each worker process may handle
    self._initializer = initializer  # function run after each worker process starts
    self._initargs = initargs  # arguments for _initializer

    if processes is None:
        try:
            processes = cpu_count()
        except NotImplementedError:
            processes = 1
    if processes < 1:
        raise ValueError("Number of processes must be at least 1")

    if initializer is not None and not hasattr(initializer, '__call__'):
        raise TypeError('initializer must be a callable')

    self._processes = processes
    self._pool = []
    # _repopulate_pool effectively does:
    #     w = self.Process()
    #     self._pool.append(w)
    self._repopulate_pool()

    self._worker_handler = threading.Thread(target=Pool._handle_workers,
                                            args=(self,))
    self._worker_handler.daemon = True
    self._worker_handler._state = RUN
    self._worker_handler.start()

    self._task_handler = threading.Thread(
        target=Pool._handle_tasks,
        args=(self._taskqueue, self._quick_put, self._outqueue,
              self._pool, self._cache))
    self._task_handler.daemon = True
    self._task_handler._state = RUN
    self._task_handler.start()

    self._result_handler = threading.Thread(target=Pool._handle_results,
                                            args=(self._outqueue,
                                                  self._quick_get,
                                                  self._cache))
    self._result_handler.daemon = True
    self._result_handler._state = RUN
    self._result_handler.start()

    self._terminate = Finalize(
        self, self._terminate_pool,
        args=(self._taskqueue, self._inqueue, self._outqueue,
              self._pool, self._worker_handler, self._task_handler,
              self._result_handler, self._cache),
        exitpriority=15)

def pool_init(tokenizer_opts):
    global TOKENIZER
    TOKENIZER = CoreNlpTokenizer(**tokenizer_opts)
    # Do not call close() here: Finalize needs the method itself,
    # not its return value.
    Finalize(TOKENIZER, TOKENIZER.close, exitpriority=100)

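# The comment above is worth spelling out: Finalize stores the callable and
# invokes it later, so writing TOKENIZER.close() would close the tokenizer
# immediately and register the method's return value (None) instead.
# A tiny demonstration with a hypothetical Resource class:
from multiprocessing.util import Finalize


class Resource:
    def close(self):
        print('closed')


r = Resource()
Finalize(r, r.close, exitpriority=100)    # correct: close() runs at exit
# Finalize(r, r.close(), exitpriority=100)  # wrong: closes immediately, then
#                                           # errors at exit calling None
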
def __init__(self, processes=None, initializer=None, initargs=(),
             worker_names=None, maxtasksperchild=None, max_queued_tasks=0):
    """
    Overriding this method in order to:
        * Name the pool worker threads
        * Name the threads used for managing the Pool internals
    """
    self.Process = partial(DaemonProcess, name=worker_names)
    self.worker_names = worker_names

    self._setup_queues(max_queued_tasks)
    self._taskqueue = Queue.Queue()
    self._cache = {}
    self._state = RUN
    self._maxtasksperchild = maxtasksperchild
    self._initializer = initializer
    self._initargs = initargs

    if processes is None:
        try:
            processes = cpu_count()
        except NotImplementedError:
            processes = 1
    if processes < 1:
        raise ValueError("Number of processes must be at least 1")

    if initializer is not None and not hasattr(initializer, '__call__'):
        raise TypeError('initializer must be a callable')

    self._processes = processes
    self._pool = []
    self._repopulate_pool()

    self._worker_handler = threading.Thread(target=Pool._handle_workers,
                                            args=(self,),
                                            name='PoolWorkerHandler')
    self._worker_handler.daemon = True
    self._worker_handler._state = RUN
    self._worker_handler.start()

    self._task_handler = threading.Thread(
        target=Pool._handle_tasks,
        args=(self._taskqueue, self._quick_put, self._outqueue,
              self._pool, self._cache),
        name='PoolTaskHandler')
    self._task_handler.daemon = True
    self._task_handler._state = RUN
    self._task_handler.start()

    self._result_handler = threading.Thread(target=Pool._handle_results,
                                            args=(self._outqueue,
                                                  self._quick_get,
                                                  self._cache),
                                            name='PoolResultHandler')
    self._result_handler.daemon = True
    self._result_handler._state = RUN
    self._result_handler.start()

    self._terminate = Finalize(
        self, self._terminate_pool,
        args=(self._taskqueue, self._inqueue, self._outqueue,
              self._pool, self._worker_handler, self._task_handler,
              self._result_handler, self._cache),
        exitpriority=15)

# get the closest docs for each question.
logger.info('Initializing ranker...')
ranker = retriever.get_class('tfidf')(tfidf_path=args.tfidf)

logger.info('Ranking...')
closest_docs = ranker.batch_closest_docs(questions, k=args.n_docs,
                                         num_workers=args.num_workers)

tok_class = tokenizers.get_class(args.tokenizer)
tok_opts = {}
db_class = retriever.DocDB
db_opts = {'db_path': args.doc_db}
PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
PROCESS_DB = db_class(**db_opts)
Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

del ranker

logger.info('Initializing Reader...')
reader = Reader(args.reader_model_type, args.reader_path,
                args.reader_output_dir)
reader.load_model()
tokenizer = reader.get_tokenizer()

logger.info('Splitting documents into passages...')
documents = []
docs_per_queston = []
for doc_ids, _ in closest_docs:

def init(tokenizer_class, options):
    global TOK
    TOK = tokenizer_class(**options)
    Finalize(TOK, TOK.shutdown, exitpriority=100)

def __init__(self, rfs):
    self.rfs = rfs
    self.rfs.enter_chroot()
    self.finalizer = Finalize(self, self.rfs.leave_chroot, exitpriority=10)

def init(db_class, db_opts, tok_class, tok_opts):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)
    PROCESS_TOK = tok_class(**tok_opts)
    # Also register the tokenizer's shutdown, matching the sibling init()
    # snippets; otherwise the tokenizer is never cleaned up at exit.
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)

import sys
import unicodedata

sys_dir = './'
sys.path.append(sys_dir)

import json
import tokenizers
from multiprocessing.util import Finalize

tokenizers.set_default('corenlp_classpath', sys_dir + '/data/corenlp/*')

import string
import regex as re

tok_class = tokenizers.get_class("corenlp")
tok_opts = {}
PROCESS_TOK = tok_class(**tok_opts)
Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)


def tokenizer_text(my_str='', uncased=True):
    '''
    :param my_str: string
    :param uncased: bool
    :return: list[str]
    '''
    text = unicodedata.normalize('NFD', my_str)
    answer = PROCESS_TOK.tokenize(text)
    if uncased:
        answer_word = answer.words(uncased=True)
    else:
        answer_word = answer.words(uncased=False)

def init(tokenizer_class, db_class, db_opts):
    global PROCESS_TOK, PROCESS_DB
    PROCESS_TOK = tokenizer_class()
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_DB = db_class(**db_opts)
    Finalize(PROCESS_DB, PROCESS_DB.close, exitpriority=100)

def init(tokenizer_class, tokenizer_opts, candidates=None):
    global PROCESS_TOK, PROCESS_DB, PROCESS_CANDS
    PROCESS_TOK = tokenizer_class(**tokenizer_opts)
    Finalize(PROCESS_TOK, PROCESS_TOK.shutdown, exitpriority=100)
    PROCESS_CANDS = candidates