def test_include_real_paths(reset_globals):
    '''Check that ``!include_yaml`` reads real files from disk.

    Three temporary files are written.  The first holds the top-level
    configuration; it includes the second by ``t2.name`` and the third
    by the base name of ``t3.name`` only.

    '''
    # Context managers close (and thereby delete) the temporary files
    # even when one of the assertions below fails.
    with tempfile.NamedTemporaryFile() as t1, \
            tempfile.NamedTemporaryFile() as t2, \
            tempfile.NamedTemporaryFile() as t3:
        y1 = u'''
t1:
  k3: !include_yaml %s
  k4: !include_yaml %s
''' % (t2.name, os.path.basename(t3.name))
        print(y1)
        y2 = u'dog'
        y3 = u'two'
        t1.write(y1.encode('utf-8'))
        t2.write(y2.encode('utf-8'))
        t3.write(y3.encode('utf-8'))
        # Flush so the content is on disk before it is read by path.
        t1.flush()
        t2.flush()
        t3.flush()

        config = set_global_config(t1.name)
        assert get_global_config() is config
        print(config)
        sub_config = get_global_config('t1')
        assert sub_config is config['t1']
        assert sub_config['k3'] == y2
        assert sub_config['k4'] == y3
def test_kvlayer_simple(configurator, tmpdir):
    '''Write one stream item through ``to_kvlayer`` and read it back.'''
    item = streamcorpus.make_stream_item('2000-01-01T12:34:00.000123Z',
                                         'test://test.stream.item/')
    chunk_path = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunk_path, mode='wb') as out_chunk:
        out_chunk.add(item)

    with configurator():
        store_item = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        store_item(chunk_path, {}, '')

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'stream_items': 2})
        print(repr(list(kvlclient.scan_keys('stream_items'))))
        # The single key that the item above is expected to be stored under.
        wanted_key = (uuid.UUID(int=946730040),
                      uuid.UUID(hex='985c1e3ed73256cd9a399919fe93cf76'))
        for (_key, value) in kvlclient.get('stream_items', wanted_key):
            assert value is not None

        load_items = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        loaded = list(load_items(''))
        assert len(loaded) == 1
        only = loaded[0]
        assert only.stream_time.epoch_ticks == item.stream_time.epoch_ticks
        assert only.abs_url == item.abs_url
def test_normalize():
    '''Check the ``k`` value seen under ``normalized`` for three config sources.

    The default configuration, an explicit dictionary and inline YAML
    are each installed in turn with :class:`Normalized`.

    '''
    def current_k():
        # Read the value from whichever global config is active right now.
        return yakonfig.get_global_config('normalized')['k']

    with yakonfig.defaulted_config([Normalized]):
        assert current_k() == 'v'

    with yakonfig.defaulted_config([Normalized], {'k': 'foo'}):
        assert current_k() == 'f'

    yaml_text = '''
normalized:
  k: zoom!
'''
    with yakonfig.defaulted_config([Normalized], yaml=yaml_text):
        assert current_k() == 'z'
def v1_folder_extract_post(fid, sid):
    '''Submit an ``ingest`` work unit for ``(fid, sid)`` unless one is waiting.

    The work unit key is the CBOR encoding of ``(fid, sid)``.

    :param fid: first element of the work unit key
    :param sid: second element of the work unit key
    :return: ``{'state': 'pending'}`` when the existing work unit is
      available, blocked or pending; otherwise a new work unit is
      added and ``{'state': 'submitted'}`` is returned
    :rtype: dict

    '''
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    if wu_status and wu_status['status'] in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}

    logger.info('launching async work unit for %r', (fid, sid))
    # Reuse the task master and key built above instead of rebuilding
    # identical ones for the submission.
    tm.add_work_units('ingest', [(key, {})])
    return {'state': 'submitted'}
def main():
    '''Run one harness command from the command line.

    The first positional argument selects the command ("load", "init",
    "start", "step" or "stop"); the remaining arguments are that
    command's input.  "load" parses the file named by the harness
    ``truth_data_path`` setting into the label store.  The other
    commands call the matching :class:`Harness` method and print its
    JSON-encoded result.

    '''
    parser = argparse.ArgumentParser(
        # ArgumentParser's first positional parameter is `prog`, so the
        # descriptive text has to be passed by keyword.
        description='Command line interface to the office TREC DD jig.',
        usage=usage,
        conflict_handler='resolve')
    parser.add_argument('command', help='must be "load", "init", "start", "step", or "stop"')
    parser.add_argument('args', help='input for given command',
                        nargs=argparse.REMAINDER)
    modules = [yakonfig, kvlayer, Harness]
    args = yakonfig.parse_args(parser, modules)

    logging.basicConfig(level=logging.DEBUG)

    if args.command not in ('load', 'init', 'start', 'step', 'stop'):
        sys.exit('The only valid commands are "load", "init", "start", "step", and "stop".')

    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    config = yakonfig.get_global_config('harness')
    harness = Harness(config, kvl, label_store)

    if args.command == 'load':
        if not config.get('truth_data_path'):
            sys.exit('Must provide --truth-data-path as an argument')
        if not os.path.exists(config['truth_data_path']):
            sys.exit('%r does not exist' % config['truth_data_path'])
        parse_truth_data(label_store, config['truth_data_path'])
        logger.info('Done!  The truth data was loaded into this '
                    'kvlayer backend:\n%s',
                    json.dumps(yakonfig.get_global_config('kvlayer'),
                               indent=4, sort_keys=True))

    elif args.command == 'init':
        response = harness.init()
        print(json.dumps(response))

    elif args.command == 'start':
        response = harness.start()
        print(json.dumps(response))

    elif args.command == 'stop':
        # Exit with a message rather than an IndexError traceback.
        if not args.args:
            sys.exit('The "stop" command requires an argument.')
        response = harness.stop(args.args[0])
        print(json.dumps(response))

    elif args.command == 'step':
        if not args.args:
            sys.exit('The "step" command requires a topic id argument.')
        topic_id = args.args[0]
        parts = args.args[1:]
        feedback = harness.step(topic_id, parts)
        print(json.dumps(feedback))
def test_include_abstract(reset_globals, monkeypatch_open):
    '''Check that an ``!include`` tag inside a YAML stream gets expanded.

    The include path is never read from disk; presumably the
    ``monkeypatch_open`` fixture supplies the included content.

    '''
    yaml_stream = StringIO('''
app_one:
  one: car
app_two:
  bad: [cat, horse]
  good: !include /some-path-that-will-not-be-used
''')
    whole = set_global_config(yaml_stream)
    assert get_global_config() is whole

    app_two = get_global_config('app_two')
    assert app_two is whole['app_two']
    assert app_two['good'] == dict(k1='v1', k2=['v21'])
def dragnet_status():
    '''Return the status of the ``dragnet`` work unit, if there is one.

    :return: the ``'status'`` entry of the work unit status record for
      ``DRAGNET_KEY``, or :const:`None` when the record is empty

    '''
    task_master = coordinate.TaskMaster(
        yakonfig.get_global_config('coordinate'))
    record = task_master.get_work_unit_status('dragnet', DRAGNET_KEY)
    return record['status'] if record else None
def client(config=None, storage_type=None, *args, **kwargs):
    '''Create a kvlayer client object.

    With no arguments, gets the global :mod:`kvlayer` configuration
    from :mod:`yakonfig` and uses that.  A `config` dictionary, if
    provided, is used in place of the :mod:`yakonfig` configuration.
    `storage_type` overrides the corresponding field in the
    configuration, but it must be supplied in one place or the other.
    Any additional parameters are passed to the corresponding
    backend's constructor.

    >>> local_storage = kvlayer.client(config={}, storage_type='local',
    ...                                app_name='app', namespace='ns')

    If there is additional configuration under the value of
    `storage_type`, that is overlaid over `config` and passed to the
    storage implementation.

    :param dict config: :mod:`kvlayer` configuration dictionary
    :param str storage_type: name of storage implementation
    :raise kvlayer._exceptions.ConfigurationError: if `storage_type`
      is not provided or is invalid

    '''
    global _load_entry_point_kvlayer_impls_done
    if config is None:
        config = yakonfig.get_global_config('kvlayer')
    if storage_type is None:
        try:
            storage_type = config['storage_type']
        except KeyError:
            # The exception object itself is not needed, so it is not
            # bound; this form is valid on both Python 2 and Python 3.
            raise ConfigurationError(
                'No storage_type in kvlayer configuration')
    # NOTE(review): the visible body stops here, after `storage_type`
    # has been resolved; no backend is constructed or returned.
def main():
    '''Parse the command line and run a :class:`CoordinateServer`.

    Optionally writes the process id to the ``--pid`` file, points the
    job queue's ``snapshot_path_format`` at ``--snapshot-dir``, and,
    when :mod:`yappi` is available and ``--yappi`` is given, starts
    profiling with a daemon thread running :func:`yappi_logger`.

    '''
    ap = argparse.ArgumentParser()
    ap.add_argument('--host', default=None,  # NOT -h, that's help
                    help='host that coordinated will listen on, '
                    '0.0.0.0 for any input interface')
    ap.add_argument('--port', '-p', type=int, default=None,
                    help='port number that coordinated will listen on')
    ap.add_argument('--pid', default=None,
                    help='file to write pid to')
    ap.add_argument('--snapshot-dir', default=None,
                    help='directory to write snapshots to')
    ap.add_argument('--httpd', default=None,
                    help='ip:port or :port to serve http info on')
    if yappi is not None:
        ap.add_argument('--yappi', default=None, help='file to write yappi profiling to. will be suffixed by {timestamp}.txt')
    args = yakonfig.parse_args(ap, [yakonfig, dblogger, coordinate])

    if args.pid:
        with open(args.pid, 'w') as f:
            f.write(str(os.getpid()))

    if args.snapshot_dir is not None:
        cjqconfig = yakonfig.get_global_config('coordinate', 'job_queue')
        # (This modifies the global configuration in place)
        cjqconfig['snapshot_path_format'] = os.path.join(
            args.snapshot_dir, 'snapshot_{timestamp}')

    if (yappi is not None) and args.yappi:
        yappi.start()
        yt = threading.Thread(target=yappi_logger, args=(args.yappi,))
        yt.daemon = True
        yt.start()

    daemon = CoordinateServer(host=args.host, port=args.port,
                              httpd=args.httpd)
    daemon.run()
def process_path(self, chunk_path):
    '''Start the Serif executable on `chunk_path` in a new scratch directory.

    A uniquely named directory is created under the pipeline's
    ``tmp_dir_path``, a parameter file is written into it with
    :meth:`_write_config_par`, and the child process is stored in
    ``self._child``.

    :param str chunk_path: path of the chunk file handed to Serif
    :raise OSError: re-raised after logging if the process cannot be
      started

    '''
    scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
    tmp_dir = os.path.join(scp_config['tmp_dir_path'], str(uuid.uuid4()))
    os.mkdir(tmp_dir)
    par_file = self.config['par']
    tagger_root_path = os.path.join(self.config['third_dir_path'],
                                    self.config['path_in_third'])
    par_path = self._write_config_par(tmp_dir, par_file, tagger_root_path)

    # NOTE(review): `tmp_chunk_path` and `start_time` are not used in
    # the visible part of this method; they are kept as-is.
    tmp_chunk_path = os.path.join(tmp_dir, 'output',
                                  os.path.basename(chunk_path))

    cmd = [
        os.path.join(tagger_root_path,
                     self.config.get('serif_exe', 'bin/x86_64/Serif')),
        par_path,
        '-o', tmp_dir,
        chunk_path,
    ]

    logger.info('serif cmd: %r', cmd)

    start_time = time.time()
    ## make sure we are using as little memory as possible
    gc.collect()
    try:
        self._child = subprocess.Popen(cmd, stderr=subprocess.PIPE,
                                       shell=False)
    except OSError:
        logger.error('error running serif cmd %r', cmd, exc_info=True)
        # format_exc() formats the exception being handled; its first
        # parameter is a frame limit, not an exception object.
        msg = traceback.format_exc()
        msg += make_memory_info_msg()
        logger.critical(msg)
        raise
def test_pipeline(request, test_data_dir):
    '''Run a pipeline task in a greenlet and shut it down with SIGTERM.

    The configuration in ``test_dedup_chunk_counts.yaml`` reads its
    input from stdin, so ``sys.stdin`` is temporarily replaced with a
    buffer holding the test chunk path.

    '''
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            g = gevent.spawn(p._process_task, work_unit)

            gevent.sleep(5)

            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)

            logger.debug('now joining...')
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            # Put the real stdin back so later tests are not left with
            # the substitute buffer.
            sys.stdin = stdin
def main():
    '''Debugging entry point for the ``openquery`` cache.

    The ``action`` argument selects what happens for the key built
    from the folder and subfolder names (spaces replaced by
    underscores, CBOR-encoded):

    * ``run`` calls :func:`traverse_extract_fetch`, stopping after
      extraction;
    * ``delete`` removes the key from the ``openquery`` table;
    * ``cache`` scans the table, prints the records whose first key
      element equals the key, and prints the total record count.

    Any other action does nothing.

    '''
    # ArgumentParser's first positional parameter is `prog`, so the
    # descriptive text has to be passed by keyword.
    p = argparse.ArgumentParser(
        description='simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    key = cbor.dumps((args.folder.replace(' ', '_'),
                      args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        kvlclient.delete('openquery', (key,))
        print('deleted %r' % key)

    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                # Call form prints the same text as the Python 2
                # statement and is also valid on Python 3.
                print(rec)
        print('%d cached queries' % count)
def test_kvlayer_reader_and_writer(configurator, test_data_dir):
    '''Read stored stream items back with several range strings.

    For each input string the stream ids produced by the reader must
    equal the de-duplicated stream ids of the source chunk.

    '''
    with chunks(configurator, test_data_dir) as (path, client):
        ## check that index table was created
        index_keys = [
            (doc_id, epoch_ticks)
            for (doc_id, epoch_ticks), _empty
            in client.scan('stream_items_doc_id_epoch_ticks')
        ]
        all_doc_ids = sorted(set(doc_id for doc_id, _ticks in index_keys))
        all_epoch_ticks = sorted(set(ticks for _doc_id, ticks in index_keys))
        logger.info('%d doc_ids', len(all_doc_ids))

        ## make an reader
        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))

        ## test it with different i_str inputs:
        full_range = '%d,%s,%d,%s' % (all_epoch_ticks[0], all_doc_ids[0],
                                      all_epoch_ticks[-1], all_doc_ids[-1])
        for i_str in ['', '0,,%d,' % 10**10, full_range]:
            stream_ids = sorted(item.stream_id for item in reader(i_str))

            inserted_ids = [item.stream_id
                            for item in streamcorpus.Chunk(path)]
            unique_ids = set(inserted_ids)
            logger.info('%d inserts, %d unique',
                        len(inserted_ids), len(unique_ids))
            input_chunk_ids = sorted(unique_ids)

            assert len(input_chunk_ids) == len(stream_ids)
            assert input_chunk_ids == stream_ids
def main():
    '''Debugging entry point for the ``openquery`` cache.

    The ``action`` argument selects what happens for the key built
    from the folder and subfolder names (spaces replaced by
    underscores, CBOR-encoded):

    * ``run`` calls :func:`traverse_extract_fetch`, stopping after
      extraction;
    * ``delete`` removes the key from the ``openquery`` table;
    * ``cache`` scans the table, prints the records whose first key
      element equals the key, and prints the total record count.

    Any other action does nothing.

    '''
    # ArgumentParser's first positional parameter is `prog`, so the
    # descriptive text has to be passed by keyword.
    p = argparse.ArgumentParser(
        description='simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    key = cbor.dumps(
        (args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        kvlclient.delete('openquery', (key, ))
        print('deleted %r' % key)

    elif args.action == 'cache':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                # Call form prints the same text as the Python 2
                # statement and is also valid on Python 3.
                print(rec)
        print('%d cached queries' % count)
def __init__(self, host=None, port=None, config=None, httpd=None):
    '''Set up server state from arguments and configuration.

    :param host: listen host; read with ``_cfget('host')`` when not given
    :param port: listen port; read with ``_cfget('port')`` when not given
    :param config: server configuration; the global
      ``coordinate``/``server`` configuration is used when not given
    :param httpd: ``ip:port`` or ``:port`` string; read with
      ``_cfget('httpd')`` when not given
    :raise ProgrammerError: if the httpd setting is present but
      contains no ``:``

    '''
    if not config:
        config = yakonfig.get_global_config('coordinate', 'server')
    self.config = config

    self.host = host or self._cfget('host')
    self.port = port or self._cfget('port')
    self.do_rpclog = self._cfget('rpclog')

    # TCPServer
    self.server = None

    # A single server state object is the target of every request
    # handler; its methods should synchronize as needed and be
    # thread safe.
    self.pobj = MultiBackendProxyObject(
        self.do_rpclog,
        module_config=self._cfget('modules'),
        module_instances=[JobQueue()]
    )

    self.httpd_params = httpd or self._cfget('httpd')
    params = self.httpd_params
    if params is not None:
        if ':' not in params:
            raise ProgrammerError(
                'httpd config needs ip:port or :port to serve on, got {!r}'
                .format(params))
    self.httpd = None
    self.httpd_thread = None
def make_rejester_jobs(task_master, kvl, sources, work_spec_name):
    '''Create :mod:`rejester` jobs for inbound stream items.

    The work spec names :func:`rejester_run` in
    ``diffeo_search_tools.rejester_runner`` as its run function and
    carries the current global configuration.  One work unit is
    created per key produced by :func:`_sid_iter`.

    :param task_master: object whose ``update_bundle`` method receives
      the work spec and the work units
    :param kvl: kvlayer client
    :type kvl: :class:`kvlayer._abstract_storage.AbstractStorage`
    :param list sources: source name strings to consider, or
      :const:`None` for all
    :param str work_spec_name: name of the rejester work spec

    '''
    work_spec = dict(
        name=work_spec_name,
        desc='elasticsearch loader',
        min_gb=1,
        config=yakonfig.get_global_config(),
        module='diffeo_search_tools.rejester_runner',
        run_function='rejester_run',
        terminate_function='rejester_terminate',
    )

    # Only the keys matter; every work unit gets the placeholder value 0.
    work_units = dict.fromkeys(_sid_iter(kvl, sources), 0)
    task_master.update_bundle(work_spec, work_units)
def test_pipeline(request, test_data_dir):
    '''Run a pipeline task in a greenlet and shut it down with SIGTERM.

    The configuration in ``test_dedup_chunk_counts.yaml`` reads its
    input from stdin, so ``sys.stdin`` is temporarily replaced with a
    buffer holding the test chunk path.

    '''
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0
            g = gevent.spawn(p._process_task, work_unit)

            gevent.sleep(5)

            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)

            logger.debug('now joining...')
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            # Put the real stdin back so later tests are not left with
            # the substitute buffer.
            sys.stdin = stdin
def main(options):
    """Run the recommender system on a sequence of topics.

    :param options: indexable sequence; items 0 to 3 are used as the
      search method, the feedback options, ``poids`` and the id that
      selects the ``config<id>.yaml`` file handed to the ambassador

    """
    description = "System using LDA, Kmeans and Solr to optimize diversification and exploitation of different topics"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--overwrite", action="store_true")
    args = yakonfig.parse_args(parser, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    config = yakonfig.get_global_config("harness")
    batch_size = config.get("batch_size", 5)
    run_file_path = config["run_file_path"]
    if os.path.exists(run_file_path):
        # NOTE(review): --overwrite makes no difference here.  Both
        # branches of the original if/else removed the file (the
        # alternative, exiting with '%r already exists', was commented
        # out), so the check is collapsed into one unconditional removal.
        os.remove(run_file_path)

    kvl_config = {"storage_type": "local",
                  "namespace": "test",
                  "app_name": "test"}
    kvl = kvlayer.client(kvl_config)
    method, feedback_options, poids, id_config = options[0], options[1], options[2], options[3]
    # Call-form prints produce the same text as the Python 2 statements
    # and are also valid on Python 3.
    print("%s %s" % (method, poids))
    system = SearchSystem([], method, feedback_options, poids)
    print(args.config)
    args.config = "config" + str(id_config) + ".yaml"
    print(args.config)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
def v1_folder_extract_get(request, response, kvlclient, store, fid, sid):
    '''Report the state of the ``ingest`` work unit for ``(fid, sid)``.

    :return: ``{'state': 'pending'}`` while the work unit is
      available, blocked or pending; the JSON-decoded ``openquery``
      record with ``'state': 'done'`` added once it has finished;
      ``{'state': 'failed'}`` in every other case, including when the
      work unit has no status or the stored record cannot be loaded
    :rtype: dict

    '''
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    if not wu_status:
        # No status record: indexing it would raise a TypeError, so
        # report it like any other state that is neither pending nor
        # finished.
        return {'state': 'failed'}
    status = wu_status['status']
    if status in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}
    elif status in (FINISHED,):
        kvlclient.setup_namespace({'openquery': (str,)})
        data = None
        try:
            data = list(kvlclient.get('openquery', (key,)))
            assert len(data) == 1, data
            logger.info('got data of len 1: %r', data)
            assert data[0], data
            assert data[0][1], data
            data = data[0][1]
            data = json.loads(data)
            data['state'] = 'done'
            return data
        except Exception:
            # Deliberately broad: any failure loading the record is
            # reported as 'failed'.  Unlike a bare except, this lets
            # SystemExit and KeyboardInterrupt propagate.
            logger.info('kvlclient: %r', kvlclient)
            logger.error('Failed to get openquery data: %r',
                         data, exc_info=True)
            return {'state': 'failed'}
    else:
        return {'state': 'failed'}
def test_archive_by_count(xconfig, jobqueue_conf):
    '''Check archiving with ``limit_completed_count`` set to 2.

    Three work units are finished in order.  After archiving, the
    finished count is still 3 but only the last two are listed.

    '''
    base = yakonfig.get_global_config('coordinate', 'job_queue')
    config = dict(base, limit_completed_count=2)
    config.update(jobqueue_conf)
    job_queue = JobQueue(config)
    if job_queue.postgres_connect_string:
        pytest.skip('TODO: postgres has not implemented archive by count')

    names = ['wu1', 'wu2', 'wu3']
    job_queue.set_work_spec({'name': 'ws1'})
    job_queue.add_work_units('ws1', [(name, {'x': 1}) for name in names])

    # Bump all three work units to "finished"
    for expected_name in names:
        wu_parts, msg = job_queue.get_work('id1', {})
        spec_name, unit_name = wu_parts[0], wu_parts[1]
        assert spec_name == 'ws1'
        assert unit_name == expected_name
        job_queue.update_work_unit(spec_name, unit_name,
                                   {'status': FINISHED})

    # Nothing has been archived yet: the finished count is 3 and all
    # three work units are listed.
    counts, msg = job_queue.count_work_units('ws1')
    assert counts[FINISHED] == 3
    wus, msg = job_queue.get_work_units('ws1', {})
    assert [unit[0] for unit in wus] == ['wu1', 'wu2', 'wu3']

    job_queue.archive()

    # The count is unchanged, but the unit that ran first (wu1) is no
    # longer listed.
    counts, msg = job_queue.count_work_units('ws1')
    assert counts[FINISHED] == 3
    wus, msg = job_queue.get_work_units('ws1', {})
    assert [unit[0] for unit in wus] == ['wu2', 'wu3']
def as_child(cls, global_config, parent=None):
    '''Run a single job in a child process.

    This method never returns; it always calls :func:`sys.exit` with
    an error code that says what it did: ``EXIT_SUCCESS`` if the
    worker did work, ``EXIT_BORED`` if it did not, and
    ``EXIT_EXCEPTION`` if an exception was raised.

    :param global_config: configuration installed as the global
      :mod:`yakonfig` configuration before the worker is created
    :param parent: passed to the worker's ``register`` call

    '''
    try:
        setproctitle('rejester worker')
        random.seed()  # otherwise everyone inherits the same seed
        yakonfig.set_default_config([yakonfig, dblogger, rejester],
                                    config=global_config)
        worker = cls(yakonfig.get_global_config(rejester.config_name))
        worker.register(parent=parent)
        did_work = worker.run(set_title=True)
        worker.unregister()
        if did_work:
            sys.exit(cls.EXIT_SUCCESS)
        else:
            sys.exit(cls.EXIT_BORED)
    except Exception as e:
        # `as` is valid from Python 2.6 on and on Python 3.
        # There's some off chance we have logging.
        # You will be here if redis is down, for instance,
        # and the yakonfig dblogger setup runs but then
        # the get_work call fails with an exception.
        if len(logging.root.handlers) > 0:
            logger.critical('failed to do any work', exc_info=e)
        sys.exit(cls.EXIT_EXCEPTION)
def v1_folder_extract_get(request, response, kvlclient, store, fid, sid):
    '''Report the state of the ``ingest`` work unit for ``(fid, sid)``.

    :return: ``{'state': 'pending'}`` while the work unit is
      available, blocked or pending; the JSON-decoded ``openquery``
      record with ``'state': 'done'`` added once it has finished;
      ``{'state': 'failed'}`` in every other case, including when the
      work unit has no status or the stored record cannot be loaded
    :rtype: dict

    '''
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    if not wu_status:
        # No status record: indexing it would raise a TypeError, so
        # report it like any other state that is neither pending nor
        # finished.
        return {'state': 'failed'}
    status = wu_status['status']
    if status in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}
    elif status in (FINISHED, ):
        kvlclient.setup_namespace({'openquery': (str, )})
        data = None
        try:
            data = list(kvlclient.get('openquery', (key, )))
            assert len(data) == 1, data
            logger.info('got data of len 1: %r', data)
            assert data[0], data
            assert data[0][1], data
            data = data[0][1]
            data = json.loads(data)
            data['state'] = 'done'
            return data
        except Exception:
            # Deliberately broad: any failure loading the record is
            # reported as 'failed'.  Unlike a bare except, this lets
            # SystemExit and KeyboardInterrupt propagate.
            logger.info('kvlclient: %r', kvlclient)
            logger.error('Failed to get openquery data: %r',
                         data, exc_info=True)
            return {'state': 'failed'}
    else:
        return {'state': 'failed'}
def test_yakonfig_default():
    '''The default configuration for ``[yakonfig]`` has a ``yakonfig`` key.'''
    yakonfig.set_default_config([yakonfig])
    try:
        assert 'yakonfig' in yakonfig.get_global_config()
    finally:
        # Always drop the global config installed above.
        yakonfig.clear_global_config()
def test_replaces_proxy():
    '''Check the nested ``top`` / ``bottom`` default configuration.'''
    with yakonfig.defaulted_config([ConfigurableLikeTop]):
        whole = yakonfig.get_global_config()
        # 'top' must be the only top-level key.
        assert sorted(iterkeys(whole)) == ['top']
        top = whole['top']
        assert 'bottom' in top
        bottom = top['bottom']
        assert bottom['zzz'] == '-32768'
def test_yakonfig_cli():
    '''Parsing an empty command line yields a config with a ``yakonfig`` key.'''
    yakonfig.parse_args(argparse.ArgumentParser(), [yakonfig], args=[])
    try:
        assert 'yakonfig' in yakonfig.get_global_config()
    finally:
        # Always drop the global config installed by parse_args.
        yakonfig.clear_global_config()
def configured(cls):
    '''Create a new instance from the global configuration.

    The ``dossier.store`` section of the global configuration is
    passed to the constructor as keyword arguments.  To use this,
    :class:`ElasticStore` must already have been configured by
    :mod:`yakonfig`, usually by passing the class to
    ``yakonfig.parse_args``.

    '''
    store_config = yakonfig.get_global_config('dossier.store')
    return cls(**store_config)
def test_kvlayer_negative(configurator, tmpdir):
    '''Round-trip a stream item whose timestamp is before 1970.'''
    item = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                         'test://test.stream.item/')
    chunk_path = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunk_path, mode='wb') as out_chunk:
        out_chunk.add(item)

    with configurator():
        store_item = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        store_item(chunk_path, {}, '')

        load_items = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        loaded = list(load_items(''))
        assert len(loaded) == 1
        only = loaded[0]
        assert only.stream_time.epoch_ticks == item.stream_time.epoch_ticks
        assert only.abs_url == item.abs_url
def add_flow(self, flow, config=None):
    '''Add a series of related work specs.

    `flow` is a dictionary, where the keys are work spec names and
    the values are either abbreviated work spec definitions or
    flow dictionaries that could be recursively passed into this
    function.  A value is treated as a work spec when it contains a
    ``min_gb`` key.  Each work spec is amended (in place) by adding
    `name` based on its key and ancestry, and by adding `config` as
    either the `config` parameter or the current global configuration
    if :const:`None`.

    If a given work spec contains a `config` parameter, that parameter
    is overlaid over the provided configuration for that specific work
    spec.  Thus, a work spec may contain a partial configuration for
    a specific setup of :mod:`streamcorpus_pipeline`, and the global
    configuration can contain shared settings for :mod:`kvlayer`.

    :param dict flow: work spec or dictionary of work specs
    :param dict config: global config to save with work specs
    :raise ProgrammerError: if a value in `flow` is not a mapping, or
      a work spec's ``then`` names a work spec that is not in `flow`
    :see: :func:`yakonfig.overlay_config`

    '''
    # The `collections.Mapping` alias was removed in Python 3.10;
    # prefer `collections.abc` and fall back for Python 2.
    try:
        from collections.abc import Mapping
    except ImportError:
        from collections import Mapping

    # default to the global config
    if config is None:
        config = yakonfig.get_global_config()

    # collect all work specs and trap errors before submitting any
    work_specs = {}
    # list() yields a real list with .pop() on Python 3 as well
    worklist = list(flow.items())
    while worklist:
        (k, v) = worklist.pop()
        if not isinstance(v, Mapping):
            raise ProgrammerError('invalid work spec flow definition')
        if 'min_gb' in v:
            # v is a work spec
            v['name'] = k
            if 'config' in v:
                v['config'] = yakonfig.overlay_config(config, v['config'])
            else:
                v['config'] = config
            work_specs[k] = v
        else:
            # v is a nested flow; queue its children under dotted names
            for kk, vv in v.items():
                worklist.append((k + '.' + kk, vv))

    # check that chaining is correct
    for k, v in work_specs.items():
        if 'then' in v and v['then'] not in work_specs:
            raise ProgrammerError(
                'work spec {} chained to invalid work spec {}'
                .format(k, v['then']))

    # all good, submit them all
    for d in work_specs.values():
        self.set_work_spec(d)
def __init__(self, *args, **kwargs):
    '''Create a `Folders` instance, adding settings from the global config.

    The ``dossier.folders`` section of the global configuration, when
    present, supplies keyword arguments for the parent constructor;
    explicit `kwargs` take precedence over it.

    '''
    # A horrible hack to create a new `Folders` instance with config.
    try:
        # Copy first: the rename below must not edit the global
        # configuration dictionary in place.
        config = dict(yakonfig.get_global_config('dossier.folders'))
    except KeyError:
        config = {}
    # For old configs.
    if 'prefix' in config:
        config['namespace'] = config.pop('prefix')
    super(Folders, self).__init__(*args, **dict(config, **kwargs))
def v1_dragnet():
    '''Submit the ``dragnet`` work unit unless one is still in progress.

    :return: ``{'state': 'pending'}`` when :func:`dragnet_status`
      reports a status other than finished or failed; otherwise a work
      unit is added and ``{'state': 'submitted'}`` is returned
    :rtype: dict

    '''
    status = dragnet_status()
    if status and status not in (FINISHED, FAILED):
        return {'state': 'pending'}

    logger.info('launching dragnet async work unit')
    task_master = coordinate.TaskMaster(
        yakonfig.get_global_config('coordinate'))
    task_master.add_work_units('dragnet', [(DRAGNET_KEY, {})])
    return {'state': 'submitted'}
def test_cli_good():
    '''``--key v`` on the command line ends up as ``config.k == 'v'``.'''
    yakonfig.parse_args(argparse.ArgumentParser(), [ConfigurableArgs()],
                        args=['--key', 'v'])
    try:
        whole = yakonfig.get_global_config()
        assert 'config' in whole
        section = whole['config']
        assert 'k' in section
        assert section['k'] == 'v'
    finally:
        # Always drop the global config installed by parse_args.
        yakonfig.clear_global_config()
def store(self):
    '''Return the store, creating and caching it on first use.

    The store is built on a new kvlayer client.  ``feature_indexes``
    comes from the ``dossier.store`` section of the global
    configuration and is :const:`None` when a lookup raises
    :exc:`KeyError`.

    '''
    if self._store is not None:
        return self._store
    try:
        feature_indexes = yakonfig.get_global_config(
            "dossier.store")["feature_indexes"]
    except KeyError:
        feature_indexes = None
    self._store = Store(kvlayer.client(), feature_indexes=feature_indexes)
    return self._store
def main():
    '''Call :func:`worker` with an object carrying the global config.

    The command line is parsed with :mod:`kvlayer` and
    :mod:`yakonfig`, and the resulting global configuration is passed
    along as ``spec['config']``.

    '''
    class Empty(object):
        # Bare object; it only carries the `spec` attribute set below.
        pass

    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [kvlayer, yakonfig])

    carrier = Empty()
    carrier.spec = {'config': yakonfig.get_global_config()}
    worker(carrier)
def main():
    '''Print the global configuration and the object built from it.

    ``an_object`` is wrapped in :class:`Autoconfig`, the command line
    is parsed with that wrapper, and the wrapper is then called with
    the global configuration to produce the object.

    '''
    conf = Autoconfig(an_object)
    parser = argparse.ArgumentParser()
    args = yakonfig.parse_args(parser, [conf])
    config = yakonfig.get_global_config()
    # Single-argument call form prints the same text as the Python 2
    # statements and is also valid on Python 3.
    print("The global configuration:")
    print(config)
    # An empty string, not bare print(): on Python 2 without
    # print_function, print() would output "()".
    print("")
    obj = conf(config)
    print("The object:")
    print(obj)
def store(self):
    '''Return the store, creating and caching it on first use.

    The store is built on a new kvlayer client.  ``feature_indexes``
    comes from the ``dossier.store`` section of the global
    configuration and is :const:`None` when a lookup raises
    :exc:`KeyError`.

    '''
    if self._store is not None:
        return self._store
    try:
        feature_indexes = yakonfig.get_global_config(
            'dossier.store')['feature_indexes']
    except KeyError:
        feature_indexes = None
    self._store = Store(kvlayer.client(), feature_indexes=feature_indexes)
    return self._store
def new_folders(kvlclient, request):
    '''Create a :class:`Folders` instance for one request.

    Keyword arguments come from the ``memex_dossier.folders`` section
    of the global configuration when it exists.  If the request query
    has an ``annotator_id``, it is passed as ``owner``.

    :param kvlclient: passed to :class:`Folders` as first argument
    :param request: object whose ``query`` mapping may hold
      ``annotator_id``
    :return: the new :class:`Folders` instance

    '''
    try:
        # Copy first: the global configuration is shared between
        # requests, so writing the rename or this request's `owner`
        # into it would leak one request's annotator into the next.
        config = dict(yakonfig.get_global_config('memex_dossier.folders'))
    except KeyError:
        config = {}
    # For old configs.
    if 'prefix' in config:
        config['namespace'] = config.pop('prefix')
    if 'annotator_id' in request.query:
        config['owner'] = request.query['annotator_id']
    return Folders(kvlclient, **config)
def __init__(self, *args, **kwargs):
    '''Set up the store and load the TF-IDF model.

    ``feature_indexes`` comes from the ``dossier.store`` section of
    the global configuration and is :const:`None` when a lookup
    raises :exc:`KeyError`.  The model is loaded from the path in this
    stage's ``tfidf_path`` setting.

    '''
    super(to_dossier_store, self).__init__(*args, **kwargs)
    kvl = kvlayer.client()
    try:
        feature_indexes = yakonfig.get_global_config(
            'dossier.store')['feature_indexes']
    except KeyError:
        feature_indexes = None
    self.store = Store(kvl, feature_indexes=feature_indexes)
    self.tfidf = gensim.models.TfidfModel.load(
        self.config.get('tfidf_path'))
def main():
    '''Load a truth data file into the label store.

    The path of the truth data file is the single positional
    command-line argument.  After loading, the kvlayer configuration
    is logged at debug level.

    '''
    # ArgumentParser's first positional parameter is `prog`, so the
    # descriptive text has to be passed by keyword.
    parser = argparse.ArgumentParser(
        description='test tool for checking that we can load '
                    'the truth data as distributed by NIST for '
                    'TREC 2015')
    parser.add_argument('truth_data_path', help='path to truth data file')
    modules = [yakonfig, kvlayer]
    args = yakonfig.parse_args(parser, modules)
    logging.basicConfig(level=logging.DEBUG)
    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    parse_truth_data(label_store, args.truth_data_path)
    # %s on its own line keeps the indented JSON readable; %r would
    # log the string's repr with escaped newlines.
    logger.debug('Done!  The truth data was loaded into this kvlayer backend:\n%s',
                 json.dumps(yakonfig.get_global_config('kvlayer'),
                            indent=4, sort_keys=True))
def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        build_pipeline = PipelineFactory(PipelineStages())
        pipeline = build_pipeline(
            yakonfig.get_global_config('streamcorpus_pipeline'))

        # Plant data for this file name that cannot be decoded.
        from_spinn3r_feed._prefetched[filename] = 'bogus data, dude!'

        work_unit = SimpleWorkUnit(filename)
        for field in ('start_chunk_time', 'start_count'):
            work_unit.data[field] = 0

        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)
def main():
    '''Run the pipeline over inputs named on the command line.

    Inputs are collected from ``--in-glob`` patterns and ``--input``
    paths.  ``-i -`` reads the inputs from stdin instead and cannot be
    combined with ``--in-glob`` or with other ``-i`` values.  Each
    input, stripped of surrounding whitespace, becomes one
    :class:`SimpleWorkUnit` that is processed in turn.

    '''
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument(
        '-i', '--input', action='append',
        help='file paths to input instead of reading from stdin')
    parser.add_argument(
        '--in-glob', action='append', default=[],
        help='path glob specifying input files')
    parser.add_argument(
        '--third-dir-path',
        help='path to third-party tools directory')
    parser.add_argument(
        '--tmp-dir-path',
        help='path to temporary directory for scratch files, can be large')

    args = yakonfig.parse_args(
        parser, [yakonfig, kvlayer, dblogger, streamcorpus_pipeline])
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' not in args.input:
            input_paths.extend(args.input)
        else:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            input_paths = sys.stdin

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for module_name in scp_config['external_stages_modules']:
            stages.load_module_stages(module_name)
    pipeline = PipelineFactory(stages)(scp_config)

    for i_str in input_paths:
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

    ## explicitly call cleanup, which is idempotent
    pipeline.cleanup()
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        build_pipeline = PipelineFactory(PipelineStages())
        pipeline = build_pipeline(
            yakonfig.get_global_config('streamcorpus_pipeline'))

        work_unit = SimpleWorkUnit(filename)
        for field in ('start_chunk_time', 'start_count'):
            work_unit.data[field] = 0
        pipeline._process_task(work_unit)

        # The written chunk must hold exactly the expected URLs, in order.
        with Chunk(path=output_file, mode='rb') as chunk:
            found_urls = [item.abs_url for item in chunk]
            assert found_urls == urls
def main():
    '''Process the named directories with :mod:`streamcorpus_pipeline`.

    The ``streamcorpus_directory`` configuration decides how the
    positional arguments are read (``mode``: ``files``, ``file-lists``
    or ``directories``) and where the work runs (``engine``:
    ``rejester`` or ``standalone``).

    '''
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories', nargs='+', metavar='directory',
                        help='directory name(s) to process')
    modules = [yakonfig, rejester, kvlayer, dblogger,
               streamcorpus_pipeline, DirectoryConfig]
    args = yakonfig.parse_args(parser, modules)
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    work_spec = dict(
        name=scdconfig.get('name', 'streamcorpus_directory'),
        desc='read files from a directory',
        min_gb=8,
        config=gconfig,
        module='streamcorpus_pipeline._rejester',
        run_function='rejester_run_function',
        terminate_function='rejester_terminate_function',
    )

    def get_filenames():
        # Yield one input name per file, according to the mode.
        for d in args.directories:
            mode = scdconfig['mode']
            if mode == 'files':
                yield d
            elif mode == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif mode == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(
                            os.path.join(dirpath, filename))

    work_units = dict((filename, {'start_count': 0})
                      for filename in get_filenames())

    engine = scdconfig['engine']
    if engine == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif engine == 'standalone':
        for key, data in work_units.iteritems():
            unit = SimpleWorkUnit(key)
            unit.spec = work_spec
            unit.data = data
            rejester_run_function(unit)
def test_external_stage_default(tmpdir):
    '''An external stage with no explicit settings gives its default message.'''
    scp_settings = {
        'external_stages_path': __file__,
        'reader': 'from_local_chunks',
        'writers': ['to_local_chunks'],
        'tmp_dir_path': str(tmpdir),
    }
    with yakonfig.defaulted_config(
            [streamcorpus_pipeline],
            config={'streamcorpus_pipeline': scp_settings}):
        stage_config = yakonfig.get_global_config('streamcorpus_pipeline',
                                                  'external_stage')
        stage = ExternalStage(config=stage_config)
        assert stage.get_message() == 'default message'
def main():
    '''Launch the AMQP worker.

    The worker is created with the ``already_labeled`` and ``geotime``
    filters, takes part in command-line configuration, and is given a
    :class:`coordinate.TaskMaster` before it is started.

    '''
    worker = AMQPWorker({
        'already_labeled': already_labeled,
        'geotime': geotime,
    })
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(
        parser, [yakonfig, kvlayer, dblogger, coordinate, worker])
    coordinate_config = yakonfig.get_global_config(coordinate.config_name)
    worker.task_master = coordinate.TaskMaster(coordinate_config)
    worker.start()
def test_external_stage_unregistered(tmpdir):
    '''A configured ``message`` reaches the external stage.

    No ``external_stages_path`` is set in this configuration.

    '''
    scp_settings = {
        'tmp_dir_path': str(tmpdir),
        'external_stage': {
            'message': 'configured message',
        },
    }
    with yakonfig.defaulted_config(
            [streamcorpus_pipeline],
            config={'streamcorpus_pipeline': scp_settings}):
        stage_config = yakonfig.get_global_config('streamcorpus_pipeline',
                                                  'external_stage')
        stage = ExternalStage(config=stage_config)
        assert stage.get_message() == 'configured message'
def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """end-to-end run that demands prefetched data which is missing

    ``use_prefetched`` is switched on for the spinn3r reader, and
    processing the work unit is expected to raise
    ``ConfigurationError``.
    """
    feed_settings = {'use_prefetched': True}
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = (
        feed_settings)
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        build_pipeline = PipelineFactory(PipelineStages())
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        pipeline = build_pipeline(scp_config)

        task = SimpleWorkUnit('test_file.bin')
        task.data['start_count'] = 0
        task.data['start_chunk_time'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(task)
def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """use a publisher_type filter that rejects every item in the feed"""
    feed_settings = {'publisher_type': 'MICROBLOG'}
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = (
        feed_settings)
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        build_pipeline = PipelineFactory(PipelineStages())
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        pipeline = build_pipeline(scp_config)

        task = SimpleWorkUnit(filename)
        task.data['start_count'] = 0
        task.data['start_chunk_time'] = 0
        pipeline._process_task(task)

        # with nothing passing the filter there are no chunks, so the
        # output file is never created
        output_exists = os.path.exists(output_file)
        assert not output_exists
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """use a publisher_type filter that accepts every item in the feed"""
    feed_settings = {'publisher_type': 'WEBLOG'}
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = (
        feed_settings)
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        build_pipeline = PipelineFactory(PipelineStages())
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        pipeline = build_pipeline(scp_config)

        task = SimpleWorkUnit(filename)
        task.data['start_count'] = 0
        task.data['start_chunk_time'] = 0
        pipeline._process_task(task)

        # every URL of the feed should come back out, in order
        with Chunk(path=output_file, mode='rb') as written:
            found_urls = [item.abs_url for item in written]
            assert found_urls == urls
def chunks(configurator, test_data_dir, overlay=None):
    '''Load the v0_3_0 test chunk into kvlayer and yield it.

    This is a generator that yields exactly once, from inside the
    ``configurator`` context.
    NOTE(review): it is presumably wrapped by a context-manager
    decorator that is not visible here -- confirm.

    :param configurator: callable taking the overlay and returning a
      context manager that installs the test configuration
    :param str test_data_dir: passed to
      ``get_test_v0_3_0_chunk_path`` to locate the chunk file
    :param dict overlay: passed through to `configurator`; an empty
      dictionary is used if omitted
    :return: yields a pair of the chunk path and a kvlayer client
      whose stream item namespaces have been set up

    '''
    if overlay is None:
        # build a fresh dictionary per call; a literal ``{}`` default
        # would be one object shared by every call
        overlay = {}
    with configurator(overlay):
        path = get_test_v0_3_0_chunk_path(test_data_dir)
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'to_kvlayer')
        writer = to_kvlayer(config)

        ## name_info and i_str are not used by the writer
        i_str = ''
        name_info = {}
        writer(path, name_info, i_str)

        client = kvlayer.client()
        client.setup_namespace({
            'stream_items': 2,
            'stream_items_doc_id_epoch_ticks': 2,
            'stream_items_with_source': 2,
        })
        yield path, client
def v1_suggest_get(request, response, tfidf, akagraph, query):
    '''Gather suggestions from various engines and within this dossier
    stack instance and filter/rank them before sending to requestor.

    If the ``memex_dossier.models`` configuration has a true
    ``akagraph`` value, the connected component of `query` is looked
    up in `akagraph` and added as one cluster of records, each
    annotated with its ``confidence``.  Every URL pattern listed
    under ``suggest_services`` is then filled in with the query and
    fetched; a pattern that cannot be filled in or fetched is logged
    and skipped.

    NOTE(review): in the code visible here the fetched responses are
    never consumed and nothing is returned; the function looks cut
    off -- confirm against the complete source.

    :param query: query string; a byte string is decoded as UTF-8

    '''
    if not isinstance(query, unicode):
        query = query.decode('utf8')

    config = yakonfig.get_global_config('memex_dossier.models')
    suggest_services = config.get('suggest_services', [])
    session = requests.Session()
    suggestions = []

    akagraph_config = config.get('akagraph')
    if akagraph_config:
        # (86) 13380344114
        #for candidate in phonenumber_matcher(
        #        query, country='CN'):
        #    query = candidate['canonical']
        #    break
        cluster = []
        logger.info('doing query %r', query)
        cc = akagraph.find_connected_component(query, use_soft=False)
        for rec, confidence in cc:
            rec['confidence'] = confidence
            cluster.append(rec)
        suggestions.append(cluster)

    logger.info('querying %d suggest_services', len(suggest_services))
    for url in suggest_services:
        try:
            url = url % dict(query=query)
        except Exception:
            # best effort: a bad pattern must not stop the remaining
            # services, but do record why it failed
            logger.error('failed to insert query=%r into pattern: %r',
                         query, url, exc_info=True)
            continue
        try:
            resp = session.get(url, timeout=5)
        except Exception:
            logger.error('failed to retrieve %r', url, exc_info=True)
            continue
def main():
    '''Command-line entry point for the random baseline recommender.

    Reads the ``harness`` configuration block, refuses to clobber an
    existing run file unless ``--overwrite`` is given, loads the
    truth data into a local kvlayer-backed label store, and then
    drives a ``RandomSystem`` through ``HarnessAmbassadorCLI``.

    '''
    parser = argparse.ArgumentParser(description=(
        'A baseline recommender system that uses the truth data to'
        ' create output that has perfect recall and would also have'
        ' perfect precision if you ignore subtopic diversity/novelty.'
        ' This generates output directly from the truth data and'
        ' randomly shuffles the truth data per topic, so that'
        ' the ordering of passages does not attempt to optimize any'
        ' particular quality metric.'))
    parser.add_argument('--overwrite', action='store_true')
    args = yakonfig.parse_args(parser, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    harness_config = yakonfig.get_global_config('harness')
    batch_size = harness_config.get('batch_size', 5)
    run_file_path = harness_config['run_file_path']

    # an existing run file is only replaced on explicit request
    if os.path.exists(run_file_path):
        if not args.overwrite:
            sys.exit('%r already exists' % run_file_path)
        os.remove(run_file_path)

    label_store = LabelStore(kvlayer.client({
        'storage_type': 'local',
        'namespace': 'test',
        'app_name': 'test',
    }))
    parse_truth_data(label_store, harness_config['truth_data_path'])

    # Assemble the system under test and hand it to the harness
    system = RandomSystem(make_doc_store(label_store))
    HarnessAmbassadorCLI(system, args.config, batch_size).run()
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data

    A bogus entry is planted in ``from_spinn3r_feed._prefetched`` for
    the work unit's key while ``use_prefetched`` is off; the pipeline
    is still expected to emit every URL in ``urls``.
    """
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))

        key = filename
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0

        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        try:
            pipeline._process_task(work_unit)
        finally:
            # _prefetched is shared state on from_spinn3r_feed; remove
            # the bogus entry even when processing fails so it cannot
            # leak into later tests
            del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
def rejester_run_function(work_unit):
    '''Build a pipeline from a work unit's configuration and run it.

    The ``config`` entry of ``work_unit.spec`` (an empty dictionary
    if absent) is installed as the yakonfig configuration on top of
    the kvlayer and streamcorpus_pipeline defaults.  External stages
    named by ``external_stages_path`` and ``external_stages_modules``
    are loaded when those keys are present.  The work unit's
    ``start_chunk_time`` and ``start_count`` data are reset before
    the pipeline processes it.

    :param work_unit: work unit to process; its ``spec`` and ``data``
      attributes are used, and ``data`` is modified in place

    '''
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        try:
            pipeline._process_task(work_unit)
        finally:
            ## explicitly call cleanup, which is idempotent and might
            ## not get called by atexit if we are running under
            ## multiprocessing; do it in ``finally`` so that a failed
            ## task is cleaned up as well
            pipeline.cleanup()