Example No. 1
def test_include_real_paths(reset_globals):
    t1 = tempfile.NamedTemporaryFile()
    t2 = tempfile.NamedTemporaryFile()
    t3 = tempfile.NamedTemporaryFile()
    y1 = u'''
t1:
  k3: !include_yaml %s
  k4: !include_yaml %s
''' % (t2.name, os.path.basename(t3.name))
    print(y1)
    y2 = u'dog'
    y3 = u'two'
    t1.write(y1.encode('utf-8'))
    t2.write(y2.encode('utf-8'))
    t3.write(y3.encode('utf-8'))
    t1.flush()
    t2.flush()
    t3.flush()

    config = set_global_config(t1.name)
    assert get_global_config() is config
    print(config)
    sub_config = get_global_config('t1')
    assert sub_config is config['t1']
    assert sub_config['k3'] == y2
    assert sub_config['k4'] == y3
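For reference, a sketch of the configuration the assertions above imply gets loaded; the relative !include_yaml path for k4 is evidently resolved against the including file's directory:

# equivalent structure of the loaded config in the test above
expected = {
    't1': {
        'k3': 'dog',   # contents of t2, included by absolute path
        'k4': 'two',   # contents of t3, included by relative path (basename)
    },
}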
def test_kvlayer_simple(configurator, tmpdir):
    si = streamcorpus.make_stream_item('2000-01-01T12:34:00.000123Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'stream_items': 2})
        print(repr(list(kvlclient.scan_keys('stream_items'))))
        for (k,v) in kvlclient.get(
                'stream_items',
                (uuid.UUID(int=946730040),
                 uuid.UUID(hex='985c1e3ed73256cd9a399919fe93cf76'))):
            assert v is not None

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
Example No. 3
def test_normalize():
    with yakonfig.defaulted_config([Normalized]):
        assert yakonfig.get_global_config('normalized')['k'] == 'v'
    with yakonfig.defaulted_config([Normalized], { 'k': 'foo' }):
        assert yakonfig.get_global_config('normalized')['k'] == 'f'
    with yakonfig.defaulted_config([Normalized],
                                   yaml='''
normalized:
  k: zoom!
'''):
        assert yakonfig.get_global_config('normalized')['k'] == 'z'
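The Normalized module itself is not shown, but the assertions pin down what it must do; a minimal sketch, assuming yakonfig consults a normalize_config hook on the configurable (the hook name and the default value are assumptions):

class Normalized(object):
    config_name = 'normalized'
    default_config = {'k': 'v'}

    @staticmethod
    def normalize_config(config):
        # keep only the first character of 'k', matching the assertions
        # above: 'v' -> 'v', 'foo' -> 'f', 'zoom!' -> 'z'
        config['k'] = config['k'][0]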
Example No. 4
def v1_folder_extract_post(fid, sid):
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    if wu_status and wu_status['status'] in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}
    else:
        logger.info('launching async work unit for %r', (fid, sid))
        conf = yakonfig.get_global_config('coordinate')
        tm = coordinate.TaskMaster(conf)
        tm.add_work_units('ingest', [(cbor.dumps((fid, sid)), {})])
        return {'state': 'submitted'}
Example No. 5
def v1_folder_extract_post(fid, sid):
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    if wu_status and wu_status['status'] in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}
    else:
        logger.info('launching async work unit for %r', (fid, sid))
        conf = yakonfig.get_global_config('coordinate')
        tm = coordinate.TaskMaster(conf)
        tm.add_work_units('ingest', [(cbor.dumps((fid, sid)), {})])
        return {'state': 'submitted'}
Example No. 6
def main():
    parser = argparse.ArgumentParser(
        'Command line interface to the official TREC DD jig.',
        usage=usage,
        conflict_handler='resolve')
    parser.add_argument('command', help='must be "load", "init", "start", "step", or "stop"')
    parser.add_argument('args', help='input for given command',
                        nargs=argparse.REMAINDER)
    modules = [yakonfig, kvlayer, Harness]
    args = yakonfig.parse_args(parser, modules)

    logging.basicConfig(level=logging.DEBUG)

    if args.command not in set(['load', 'init', 'start', 'step', 'stop']):
        sys.exit('The only valid commands are "load", "init", "start", "step", and "stop".')

    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    config = yakonfig.get_global_config('harness')
    harness = Harness(config, kvl, label_store)

    if args.command == 'load':
        if not config.get('truth_data_path'):
            sys.exit('Must provide --truth-data-path as an argument')
        if not os.path.exists(config['truth_data_path']):
            sys.exit('%r does not exist' % config['truth_data_path'])
        parse_truth_data(label_store, config['truth_data_path'])
        logger.info('Done!  The truth data was loaded into this '
                     'kvlayer backend:\n%s',
                    json.dumps(yakonfig.get_global_config('kvlayer'),
                               indent=4, sort_keys=True))

    elif args.command == 'init':
        response = harness.init()
        print(json.dumps(response))

    elif args.command == 'start':
        response = harness.start()
        print(json.dumps(response))

    elif args.command == 'stop':
        response = harness.stop(args.args[0])
        print(json.dumps(response))

    elif args.command == 'step':
        parts = args.args
        topic_id = parts.pop(0)
        feedback = harness.step(topic_id, parts)
        print(json.dumps(feedback))
Example No. 7
def test_include_abstract(reset_globals, monkeypatch_open):
    YAML_TEXT_TWO = StringIO('''
app_one:
  one: car

app_two:
  bad: [cat, horse]
  good: !include /some-path-that-will-not-be-used
''')
    config = set_global_config(YAML_TEXT_TWO)
    
    assert get_global_config() is config
    sub_config = get_global_config('app_two')

    assert sub_config is config['app_two']
    assert sub_config['good'] == dict(k1='v1', k2=['v21'])
Example No. 8
def dragnet_status():
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    wu_status = tm.get_work_unit_status('dragnet', DRAGNET_KEY)
    if not wu_status: return None
    status = wu_status['status']
    return status
Example No. 9
def client(config=None, storage_type=None, *args, **kwargs):
    '''Create a kvlayer client object.

    With no arguments, gets the global :mod:`kvlayer` configuration
    from :mod:`yakonfig` and uses that.  A `config` dictionary, if
    provided, is used in place of the :mod:`yakonfig` configuration.
    `storage_type` overrides the corresponding field in the
    configuration, but it must be supplied in one place or the other.
    Any additional parameters are passed to the corresponding
    backend's constructor.

    >>> local_storage = kvlayer.client(config={}, storage_type='local',
    ...                                app_name='app', namespace='ns')

    If there is additional configuration under the value of
    `storage_type`, that is overlaid over `config` and passed to the
    storage implementation.

    :param dict config: :mod:`kvlayer` configuration dictionary
    :param str storage_type: name of storage implementation
    :raise kvlayer._exceptions.ConfigurationError: if `storage_type`
      is not provided or is invalid

    '''
    global _load_entry_point_kvlayer_impls_done
    if config is None:
        config = yakonfig.get_global_config('kvlayer')
    if storage_type is None:
        try:
            storage_type = config['storage_type']
        except KeyError as exc:
            raise ConfigurationError(
                'No storage_type in kvlayer configuration')
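The docstring above describes both ways of building a client. A minimal sketch, assuming the 'local' in-memory backend; the app_name, namespace, and table values are illustrative, and the table definition mirrors setup_namespace calls used elsewhere on this page:

import kvlayer

# explicit configuration, no global yakonfig state required
local_client = kvlayer.client(config={}, storage_type='local',
                              app_name='app', namespace='ns')
local_client.setup_namespace({'openquery': (str,)})

# or rely on the global configuration under the 'kvlayer' key,
# e.g. after yakonfig.parse_args(parser, [yakonfig, kvlayer]) has run
global_client = kvlayer.client()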
Example No. 10
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument('--host', default=None,  # NOT -h, that's help
                    help='host that coordinated will listen on, '
                    '0.0.0.0 for any input interface')
    ap.add_argument('--port', '-p', type=int, default=None,
                    help='port number that coordinated will listen on')
    ap.add_argument('--pid', default=None,
                    help='file to write pid to')
    ap.add_argument('--snapshot-dir', default=None,
                    help='directory to write snapshots to')
    ap.add_argument('--httpd', default=None,
                    help='ip:port or :port to serve http info on')
    if yappi is not None:
        ap.add_argument('--yappi', default=None,
                        help='file to write yappi profiling to; '
                        'will be suffixed with {timestamp}.txt')
    args = yakonfig.parse_args(ap, [yakonfig, dblogger, coordinate])

    if args.pid:
        with open(args.pid, 'w') as f:
            f.write(str(os.getpid()))

    if args.snapshot_dir is not None:
        cjqconfig = yakonfig.get_global_config('coordinate', 'job_queue')
        # (This modifies the global configuration in place)
        cjqconfig['snapshot_path_format'] = os.path.join(
            args.snapshot_dir, 'snapshot_{timestamp}')

    if (yappi is not None) and args.yappi:
        yappi.start()
        yt = threading.Thread(target=yappi_logger, args=(args.yappi,))
        yt.daemon = True
        yt.start()

    daemon = CoordinateServer(host=args.host, port=args.port, httpd=args.httpd)
    daemon.run()
Example No. 11
    def process_path(self, chunk_path):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        tmp_dir = os.path.join(scp_config['tmp_dir_path'], str(uuid.uuid4()))
        os.mkdir(tmp_dir)
        par_file = self.config['par']
        tagger_root_path = os.path.join(self.config['third_dir_path'], 
                                        self.config['path_in_third'])

        par_path = self._write_config_par(tmp_dir, par_file, tagger_root_path)

        tmp_chunk_path = os.path.join(tmp_dir, 'output', os.path.basename(chunk_path))

        cmd = [
            os.path.join(tagger_root_path, self.config.get('serif_exe', 'bin/x86_64/Serif')),
            par_path,
            '-o', tmp_dir,
            chunk_path,
        ]

        logger.info('serif cmd: %r', cmd)

        start_time = time.time()
        ## make sure we are using as little memory as possible
        gc.collect()
        try:
            self._child = subprocess.Popen(cmd, stderr=subprocess.PIPE, shell=False)
        except OSError as exc:
            logger.error('error running serif cmd %r', cmd, exc_info=True)
            msg = traceback.format_exc()
            msg += make_memory_info_msg()
            logger.critical(msg)
            raise
Example No. 12
def test_pipeline(request, test_data_dir):
    filename=str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)

        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)

        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)
Example No. 13
def main():
    p = argparse.ArgumentParser('simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    key = cbor.dumps((args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf], config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        kvlclient.delete('openquery', (key,))
        print('deleted %r' % key)

    elif args.action == 'cache':

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str,)})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                print(rec)
        print('%d cached queries' % count)
Example No. 14
def test_kvlayer_reader_and_writer(configurator, test_data_dir):
    with chunks(configurator, test_data_dir) as (path, client):
        ## check that index table was created
        all_doc_ids = set()
        all_epoch_ticks = set()
        for (doc_id, epoch_ticks), empty_data in client.scan('stream_items_doc_id_epoch_ticks'):
            all_doc_ids.add(doc_id)
            all_epoch_ticks.add(epoch_ticks)
        all_doc_ids = sorted(all_doc_ids)
        all_epoch_ticks = sorted(all_epoch_ticks)
        logger.info('%d doc_ids', len(all_doc_ids))

        ## make a reader
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'from_kvlayer')
        reader = from_kvlayer(config)

        ## test it with different i_str inputs:
        for i_str in ['', '0,,%d,' % 10**10, '%d,%s,%d,%s' %
                      (all_epoch_ticks[0],  all_doc_ids[0],
                       all_epoch_ticks[-1], all_doc_ids[-1]) ]:
            stream_ids = []
            for si in reader(i_str):
                stream_ids.append(si.stream_id)
            _input_chunk_ids = [si.stream_id for si in streamcorpus.Chunk(path)]
            input_chunk_ids = list(set(_input_chunk_ids))
            logger.info('%d inserts, %d unique',
                        len(_input_chunk_ids), len(input_chunk_ids))
            input_chunk_ids.sort()
            stream_ids.sort()
            assert len(input_chunk_ids) == len(stream_ids)
            assert input_chunk_ids == stream_ids
Example No. 15
def dragnet_status():
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    wu_status = tm.get_work_unit_status('dragnet', DRAGNET_KEY)
    if not wu_status: return None
    status = wu_status['status']
    return status
Example No. 16
def main():
    p = argparse.ArgumentParser(
        'simple debugging tool for watching the linker and OpenQuery')
    p.add_argument('action', help='either `run` or `cache` or `delete`')
    p.add_argument('folder', help='folder name')
    p.add_argument('subfolder', help='subfolder name')
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    key = cbor.dumps(
        (args.folder.replace(' ', '_'), args.subfolder.replace(' ', '_')))

    if args.action == 'run':
        web_conf = Config()
        with yakonfig.defaulted_config([kvlayer, dblogger, web_conf],
                                       config=config):
            traverse_extract_fetch(web_conf, key, stop_after_extraction=True)

    elif args.action == 'delete':
        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        kvlclient.delete('openquery', (key, ))
        print('deleted %r' % key)

    elif args.action == 'cache':

        kvlclient = kvlayer.client()
        kvlclient.setup_namespace({'openquery': (str, )})
        count = 0
        for rec in kvlclient.scan('openquery'):
            count += 1
            if rec[0][0] == key:
                print(rec)
        print('%d cached queries' % count)
Example No. 17
    def __init__(self, host=None, port=None, config=None, httpd=None):
        self.config = config or yakonfig.get_global_config(
            'coordinate', 'server')
        self.host = host or self._cfget('host')
        self.port = port or self._cfget('port')
        self.do_rpclog = self._cfget('rpclog')

        # TCPServer
        self.server = None
        # There is one server state instance which is the target
        # for all request handlers.
        # Methods should synchronize as needed and be thread safe.
        self.pobj = MultiBackendProxyObject(
            self.do_rpclog,
            module_config=self._cfget('modules'),
            module_instances=[JobQueue()]
        )

        self.httpd_params = httpd or self._cfget('httpd')
        if self.httpd_params is not None and ':' not in self.httpd_params:
            raise ProgrammerError(
                'httpd config needs ip:port or :port to serve on, got {!r}'
                .format(self.httpd_params))
        self.httpd = None
        self.httpd_thread = None
def make_rejester_jobs(task_master, kvl, sources, work_spec_name):
    '''Create :mod:`rejester` jobs for inbound stream items.

    Each job runs :func:`rejester_run` in this module to run the
    elasticsearch index over a set of stream items.

    :param task_master: rejester task master that receives the jobs
    :type task_master: :class:`rejester.TaskMaster`
    :param kvl: kvlayer client
    :type kvl: :class:`kvlayer._abstract_storage.AbstractStorage`
    :param list sources: source name strings to consider, or
      :const:`None` for all
    :param str work_spec_name: name of the rejester work spec

    '''
    work_spec = {
        'name': work_spec_name,
        'desc': 'elasticsearch loader',
        'min_gb': 1,
        'config': yakonfig.get_global_config(),
        'module': 'diffeo_search_tools.rejester_runner',
        'run_function': 'rejester_run',
        'terminate_function': 'rejester_terminate',
    }
    si_iter = _sid_iter(kvl, sources)
    # No value needed in following dict
    work_units = { key: 0 for key in si_iter }
    # work_units = { 'item': si_iter.next() }
    task_master.update_bundle(work_spec, work_units)
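A hedged usage sketch for make_rejester_jobs, assuming yakonfig has already been configured with the rejester and kvlayer modules; the TaskMaster construction mirrors Example No. 47 below, and the work spec name is illustrative:

import kvlayer
import rejester
import yakonfig

task_master = rejester.TaskMaster(yakonfig.get_global_config('rejester'))
kvl = kvlayer.client()
# sources=None considers all sources, per the docstring above
make_rejester_jobs(task_master, kvl, sources=None,
                   work_spec_name='elasticsearch_loader')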
Example No. 19
def test_pipeline(request, test_data_dir):
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))

        ## run the pipeline
        stages = PipelineStages()
        pf = PipelineFactory(stages)
        p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

        from streamcorpus_pipeline.run import SimpleWorkUnit
        work_unit = SimpleWorkUnit('long string indicating source of text')
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        g = gevent.spawn(p._process_task, work_unit)

        gevent.sleep(5)

        with pytest.raises(SystemExit):  # pylint: disable=E1101
            p.shutdown(sig=signal.SIGTERM)

        logger.debug('now joining...')
        timeout = gevent.Timeout(1)
        g.join(timeout=timeout)
Example No. 20
def main(options):
    """Run the recommender system on a sequence of topics.
    """
    description = "System using LDA, Kmeans and Solr to optimize diversification and exploitation of different topics"
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument("--overwrite", action="store_true")
    args = yakonfig.parse_args(parser, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    config = yakonfig.get_global_config("harness")
    batch_size = config.get("batch_size", 5)
    run_file_path = config["run_file_path"]
    if os.path.exists(run_file_path):
        if args.overwrite:
            os.remove(run_file_path)
        else:
            os.remove(run_file_path)
            # sys.exit('%r already exists' % run_file_path)

    kvl_config = {"storage_type": "local", "namespace": "test", "app_name": "test"}
    kvl = kvlayer.client(kvl_config)
    method, feedback_options, poids, id_config = options[0], options[1], options[2], options[3]
    print(method, poids)
    system = SearchSystem([], method, feedback_options, poids)
    print(args.config)
    args.config = "config" + str(id_config) + ".yaml"
    print(args.config)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
Example No. 21
def v1_folder_extract_get(request, response, kvlclient, store, fid, sid):
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    status = wu_status['status']
    if status in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}
    elif status in (FINISHED,):
        kvlclient.setup_namespace({'openquery': (str,)})
        data = None
        try:
            data = list(kvlclient.get('openquery', (key,)))
            assert len(data) == 1, data
            logger.info('got data of len 1: %r', data)
            assert data[0], data
            assert data[0][1], data
            data = data[0][1]
            data = json.loads(data)
            data['state'] = 'done'
            return data
        except:
            logger.info('kvlclient: %r', kvlclient)
            logger.error('Failed to get openquery data: %r', data, exc_info=True)
            return {'state': 'failed'}

    else:
        return {'state': 'failed'}
Example No. 22
def test_archive_by_count(xconfig, jobqueue_conf):
    config = dict(yakonfig.get_global_config('coordinate', 'job_queue'))
    config['limit_completed_count'] = 2
    config.update(jobqueue_conf)
    job_queue = JobQueue(config)
    if job_queue.postgres_connect_string:
        pytest.skip('TODO: postgres has not implemented archive by count')
    job_queue.set_work_spec({'name': 'ws1'})
    job_queue.add_work_units('ws1', [('wu1', {'x': 1}),
                                     ('wu2', {'x': 1}),
                                     ('wu3', {'x': 1})])
    # Bump all three work units to "finished"
    for wu in ['wu1', 'wu2', 'wu3']:
        wu_parts, msg = job_queue.get_work('id1', {})
        assert wu_parts[0] == 'ws1'
        assert wu_parts[1] == wu
        job_queue.update_work_unit(wu_parts[0], wu_parts[1],
                                   {'status': FINISHED})

    # Archiving hasn't happened, so we should see the finished count
    # is 3, and all three work units are there
    counts, msg = job_queue.count_work_units('ws1')
    assert counts[FINISHED] == 3
    wus, msg = job_queue.get_work_units('ws1', {})
    assert [wu[0] for wu in wus] == ['wu1', 'wu2', 'wu3']

    job_queue.archive()

    # Now we should still see the same count, but the one that ran
    # first (wu1) is off the list
    counts, msg = job_queue.count_work_units('ws1')
    assert counts[FINISHED] == 3
    wus, msg = job_queue.get_work_units('ws1', {})
    assert [wu[0] for wu in wus] == ['wu2', 'wu3']
Example No. 23
    def as_child(cls, global_config, parent=None):
        '''Run a single job in a child process.

        This method never returns; it always calls :func:`sys.exit`
        with an error code that says what it did.

        '''
        try:
            setproctitle('rejester worker')
            random.seed()  # otherwise everyone inherits the same seed
            yakonfig.set_default_config([yakonfig, dblogger, rejester],
                                        config=global_config)
            worker = cls(yakonfig.get_global_config(rejester.config_name))
            worker.register(parent=parent)
            did_work = worker.run(set_title=True)
            worker.unregister()
            if did_work:
                sys.exit(cls.EXIT_SUCCESS)
            else:
                sys.exit(cls.EXIT_BORED)
        except Exception as e:
            # There's some off chance we have logging.
            # You will be here if redis is down, for instance,
            # and the yakonfig dblogger setup runs but then
            # the get_work call fails with an exception.
            if len(logging.root.handlers) > 0:
                logger.critical('failed to do any work', exc_info=e)
            sys.exit(cls.EXIT_EXCEPTION)
Example No. 24
def v1_folder_extract_get(request, response, kvlclient, store, fid, sid):
    conf = yakonfig.get_global_config('coordinate')
    tm = coordinate.TaskMaster(conf)
    key = cbor.dumps((fid, sid))
    wu_status = tm.get_work_unit_status('ingest', key)
    status = wu_status['status']
    if status in (AVAILABLE, BLOCKED, PENDING):
        return {'state': 'pending'}
    elif status in (FINISHED, ):
        kvlclient.setup_namespace({'openquery': (str, )})
        data = None
        try:
            data = list(kvlclient.get('openquery', (key, )))
            assert len(data) == 1, data
            logger.info('got data of len 1: %r', data)
            assert data[0], data
            assert data[0][1], data
            data = data[0][1]
            data = json.loads(data)
            data['state'] = 'done'
            return data
        except:
            logger.info('kvlclient: %r', kvlclient)
            logger.error('Failed to get openquery data: %r',
                         data,
                         exc_info=True)
            return {'state': 'failed'}

    else:
        return {'state': 'failed'}
Example No. 25
def test_yakonfig_default():
    yakonfig.set_default_config([yakonfig])
    try:
        c = yakonfig.get_global_config()
        assert 'yakonfig' in c
    finally:
        yakonfig.clear_global_config()
Example No. 26
def test_replaces_proxy():
    with yakonfig.defaulted_config([ConfigurableLikeTop]):
        c = yakonfig.get_global_config()
        assert sorted(iterkeys(c)) == ['top']
        c = c['top']
        assert 'bottom' in c
        c = c['bottom']
        assert c['zzz'] == '-32768'
Example No. 27
def test_yakonfig_cli():
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [yakonfig], args=[])
    try:
        c = yakonfig.get_global_config()
        assert 'yakonfig' in c
    finally:
        yakonfig.clear_global_config()    
Example No. 28
    def configured(cls):
        '''Create a new instance from the global configuration.

        In order to use this, you must make sure that
        :class:`ElasticStore` has been configured by :mod:`yakonfig`,
        usually by passing the class to ``yakonfig.parse_args``.
        '''
        return cls(**yakonfig.get_global_config('dossier.store'))
Example No. 29
    def configured(cls):
        '''Create a new instance from the global configuration.

        In order to use this, you must make sure that
        :class:`ElasticStore` has been configured by :mod:`yakonfig`,
        usually by passing the class to ``yakonfig.parse_args``.
        '''
        return cls(**yakonfig.get_global_config('dossier.store'))
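A hedged sketch of using the classmethod above, following the docstring's suggestion of passing the class to yakonfig's command-line entry point; the config file name is hypothetical, and whatever keys live under dossier.store become constructor keyword arguments:

import argparse
import kvlayer
import yakonfig

parser = argparse.ArgumentParser()
yakonfig.parse_args(parser, [kvlayer, ElasticStore],  # ElasticStore: the class above
                    args=['--config', 'config.yaml'])
store = ElasticStore.configured()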
def test_kvlayer_negative(configurator, tmpdir):
    si = streamcorpus.make_stream_item('1969-07-20T20:18:00.000000Z',
                                       'test://test.stream.item/')
    chunkfile = str(tmpdir.join('chunk.sc.xz'))
    with streamcorpus.Chunk(path=chunkfile, mode='wb') as chunk:
        chunk.add(si)

    with configurator():
        writer = to_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'to_kvlayer'))
        writer(chunkfile, {}, '')

        reader = from_kvlayer(yakonfig.get_global_config(
            'streamcorpus_pipeline', 'from_kvlayer'))
        sis = list(reader(''))
        assert len(sis) == 1
        assert sis[0].stream_time.epoch_ticks == si.stream_time.epoch_ticks
        assert sis[0].abs_url == si.abs_url
Example No. 31
    def add_flow(self, flow, config=None):
        '''Add a series of related work specs.

        `flow` is a dictionary, where the keys are work spec names and
        the values are either abbreviated work spec definitions or
        flow dictionaries that could be recursively passed into this
        function.  Each work spec is amended by adding `name` based on
        its key and ancestry, and by adding `config` as either the
        `config` parameter or the current global configuration if
        :const:`None`.

        If a given work spec contains a `config` parameter, that
        parameter is overlaid over the provided configuration for that
        specific work spec.  Thus, a work spec may contain a partial
        configuration for a specific setup of
        :mod:`streamcorpus_pipeline`, and the global configuration
        can contain shared settings for :mod:`kvlayer`.

        :param dict flow: work spec or dictionary of work specs
        :param dict config: global config to save with work specs
        :see: :func:`yakonfig.overlay_config`

        '''
        # default to the global config
        if config is None:
            config = yakonfig.get_global_config()

        # collect all work specs and trap errors before submitting any
        work_specs = {}
        worklist = flow.items()
        while worklist:
            (k, v) = worklist.pop()
            if not isinstance(v, collections.Mapping):
                raise ProgrammerError('invalid work spec flow definition')
            if 'min_gb' in v:
                # d is a work spec
                v['name'] = k
                if 'config' in v:
                    v['config'] = yakonfig.overlay_config(config, v['config'])
                else:
                    v['config'] = config
                work_specs[k] = v
            else:
                for kk, vv in v.iteritems():
                    worklist.append((k + '.' + kk, vv))

        # check that chaining is correct
        for k, v in work_specs.iteritems():
            if 'then' in v:
                if v['then'] not in work_specs:
                    raise ProgrammerError(
                        'work spec {} chained to invalid work spec {}'
                        .format(k, v['then']))

        # all good, submit them all
        for d in work_specs.itervalues():
            self.set_work_spec(d)
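A sketch of the nested flow dictionary add_flow accepts, following the docstring and traversal code above: 'min_gb' marks a leaf work spec, nesting produces dotted names, and 'then' must name another spec in the same flow. The spec names and the embedded config are hypothetical:

flow = {
    'ingest': {
        'fetch': {'min_gb': 1, 'then': 'ingest.parse'},
        'parse': {'min_gb': 4,
                  'config': {'streamcorpus_pipeline': {'tmp_dir_path': '/tmp'}}},
    },
}
# e.g. task_master.add_flow(flow) -- with config=None the current global
# configuration is attached to each work spec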
Example No. 32
    def __init__(self, *args, **kwargs):
        # A horrible hack to create a new `Folders` instance with config.
        try:
            config = yakonfig.get_global_config('dossier.folders')
            # For old configs.
            if 'prefix' in config:
                config['namespace'] = config.pop('prefix')
        except KeyError:
            config = {}
        super(Folders, self).__init__(*args, **dict(config, **kwargs))
Example No. 33
    def __init__(self, *args, **kwargs):
        # A horrible hack to create a new `Folders` instance with config.
        try:
            config = yakonfig.get_global_config('dossier.folders')
            # For old configs.
            if 'prefix' in config:
                config['namespace'] = config.pop('prefix')
        except KeyError:
            config = {}
        super(Folders, self).__init__(*args, **dict(config, **kwargs))
Example No. 34
def v1_dragnet():
    status = dragnet_status()
    if not status or status in (FINISHED, FAILED):
        logger.info('launching dragnet async work unit')
        conf = yakonfig.get_global_config('coordinate')
        tm = coordinate.TaskMaster(conf)
        tm.add_work_units('dragnet', [(DRAGNET_KEY, {})])
        return {'state': 'submitted'}
    else:
        return {'state': 'pending'}
Example No. 35
def v1_dragnet():
    status = dragnet_status()
    if not status or status in (FINISHED, FAILED):
        logger.info('launching dragnet async work unit')
        conf = yakonfig.get_global_config('coordinate')
        tm = coordinate.TaskMaster(conf)
        tm.add_work_units('dragnet', [(DRAGNET_KEY, {})])
        return {'state': 'submitted'}
    else:
        return {'state': 'pending'}
Example No. 36
def test_cli_good():
    parser = argparse.ArgumentParser()
    yakonfig.parse_args(parser, [ConfigurableArgs()], args=['--key', 'v'])
    try:
        c = yakonfig.get_global_config()
        assert 'config' in c
        assert 'k' in c['config']
        assert c['config']['k'] == 'v'
    finally:
        yakonfig.clear_global_config()
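For context, one plausible shape for the ConfigurableArgs object exercised above, written against yakonfig's configurable protocol; treat the attribute names (config_name, add_arguments, runtime_keys) and the key mapping as assumptions rather than a copy of the real fixture:

class ConfigurableArgs(object):
    config_name = 'config'
    default_config = {}

    def add_arguments(self, parser):
        parser.add_argument('--key')

    # maps the argparse destination 'key' onto the config key 'k',
    # which is what the assertions above check
    runtime_keys = {'key': 'k'}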
Example No. 37
    def store(self):
        if self._store is None:
            feature_indexes = None
            try:
                conf = yakonfig.get_global_config("dossier.store")
                feature_indexes = conf["feature_indexes"]
            except KeyError:
                pass
            self._store = Store(kvlayer.client(), feature_indexes=feature_indexes)
        return self._store
Example No. 38
def main():
    p = argparse.ArgumentParser()
    args = yakonfig.parse_args(p, [kvlayer, yakonfig])

    config = yakonfig.get_global_config()

    class Empty(object): pass
    e = Empty()
    e.spec = dict(config=config)
    worker(e)
Example No. 39
def main():
    conf = Autoconfig(an_object)
    parser = argparse.ArgumentParser()
    args = yakonfig.parse_args(parser, [conf])
    config = yakonfig.get_global_config()
    print("The global configuration:")
    print(config)
    print()
    obj = conf(config)
    print("The object:")
    print(obj)
Example No. 40
    def store(self):
        if self._store is None:
            feature_indexes = None
            try:
                conf = yakonfig.get_global_config('dossier.store')
                feature_indexes = conf['feature_indexes']
            except KeyError:
                pass
            self._store = Store(kvlayer.client(),
                                feature_indexes=feature_indexes)
        return self._store
Example No. 41
def new_folders(kvlclient, request):
    try:
        config = yakonfig.get_global_config('memex_dossier.folders')
        # For old configs.
        if 'prefix' in config:
            config['namespace'] = config.pop('prefix')
    except KeyError:
        config = {}
    if 'annotator_id' in request.query:
        config['owner'] = request.query['annotator_id']
    return Folders(kvlclient, **config)
Example No. 42
    def __init__(self, *args, **kwargs):
        super(to_dossier_store, self).__init__(*args, **kwargs)
        kvl = kvlayer.client()
        feature_indexes = None
        try:
            conf = yakonfig.get_global_config('dossier.store')
            feature_indexes = conf['feature_indexes']
        except KeyError:
            pass
        self.store = Store(kvl, feature_indexes=feature_indexes)
        tfidf_path = self.config.get('tfidf_path')
        self.tfidf = gensim.models.TfidfModel.load(tfidf_path)
Example No. 43
def main():
    parser = argparse.ArgumentParser('test tool for checking that we can load '
                                     'the truth data as distributed by NIST for '
                                     'TREC 2015')
    parser.add_argument('truth_data_path', help='path to truth data file')
    modules = [yakonfig, kvlayer]
    args = yakonfig.parse_args(parser, modules)
    logging.basicConfig(level=logging.DEBUG)
    kvl = kvlayer.client()
    label_store = LabelStore(kvl)
    parse_truth_data(label_store, args.truth_data_path)
    logger.debug('Done!  The truth data was loaded into this kvlayer backend: %r',
                 json.dumps(yakonfig.get_global_config('kvlayer'), indent=4,
                            sort_keys=True))
def test_spinn3r_pipeline_bogus_prefetched(filename, pipeline_config):
    """supply known-bad prefetched data"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(DecodeError):
            pipeline._process_task(work_unit)
Example No. 45
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description='process a sequence of stream items',
        usage='streamcorpus_pipeline --config config.yaml --input file.in')
    parser.add_argument('-i', '--input', action='append', 
                        help='file paths to input instead of reading from stdin')
    parser.add_argument('--in-glob', action='append', default=[], help='path glob specifying input files')
    parser.add_argument('--third-dir-path', help='path to third-party tools directory')
    parser.add_argument('--tmp-dir-path', help='path to temporary directory for scratch files, can be large')

    modules = [yakonfig, kvlayer, dblogger, streamcorpus_pipeline]
    args = yakonfig.parse_args(parser, modules)
    config = yakonfig.get_global_config()

    ## this modifies the global config, passed by reference
    instantiate_config(config)

    input_paths = []
    if args.in_glob:
        for pattern in args.in_glob:
            input_paths.extend(glob.glob(pattern))
    if args.input:
        if '-' in args.input:
            if args.in_glob:
                sys.exit('cannot use "-i -" and --in-glob together')
            if len(args.input) > 1:
                sys.exit('cannot use "-i -" with multiple inputs')
            input_paths = sys.stdin
        else:
            input_paths.extend(args.input)

    scp_config = config['streamcorpus_pipeline']
    stages = PipelineStages()
    if 'external_stages_path' in scp_config:
        stages.load_external_stages(scp_config['external_stages_path'])
    if 'external_stages_modules' in scp_config:
        for mod in scp_config['external_stages_modules']:
            stages.load_module_stages(mod)
    factory = PipelineFactory(stages)
    pipeline = factory(scp_config)

    for i_str in input_paths:
        work_unit = SimpleWorkUnit(i_str.strip())
        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

    ## explicitly call cleanup, which is idempotent
    pipeline.cleanup()
def test_spinn3r_pipeline(filename, urls, pipeline_config, output_file):
    """minimal end-to-end test, with a fixed pipeline"""
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example No. 47
def main():
    parser = argparse.ArgumentParser(
        conflict_handler='resolve',
        description='process entire directories using streamcorpus_pipeline')
    parser.add_argument('directories',
                        nargs='+',
                        metavar='directory',
                        help='directory name(s) to process')
    args = yakonfig.parse_args(parser, [
        yakonfig, rejester, kvlayer, dblogger, streamcorpus_pipeline,
        DirectoryConfig
    ])
    gconfig = yakonfig.get_global_config()
    scdconfig = gconfig['streamcorpus_directory']

    work_spec = {
        'name': scdconfig.get('name', 'streamcorpus_directory'),
        'desc': 'read files from a directory',
        'min_gb': 8,
        'config': gconfig,
        'module': 'streamcorpus_pipeline._rejester',
        'run_function': 'rejester_run_function',
        'terminate_function': 'rejester_terminate_function',
    }

    def get_filenames():
        for d in args.directories:
            if scdconfig['mode'] == 'files':
                yield d
            elif scdconfig['mode'] == 'file-lists':
                with open(d, 'r') as f:
                    for line in f:
                        yield line.strip()
            elif scdconfig['mode'] == 'directories':
                for dirpath, dirnames, filenames in os.walk(d):
                    for filename in filenames:
                        yield os.path.abspath(os.path.join(dirpath, filename))

    work_units = {filename: {'start_count': 0} for filename in get_filenames()}

    if scdconfig['engine'] == 'rejester':
        tm = rejester.TaskMaster(gconfig['rejester'])
        tm.update_bundle(work_spec, work_units)
    elif scdconfig['engine'] == 'standalone':
        for k, v in work_units.iteritems():
            u = SimpleWorkUnit(k)
            u.spec = work_spec
            u.data = v
            rejester_run_function(u)
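A hedged sketch of the streamcorpus_directory configuration block this script reads; the keys (name, mode, engine) come from the code above, the values are only illustrative:

gconfig = {
    'streamcorpus_directory': {
        'name': 'streamcorpus_directory',
        'mode': 'directories',   # or 'files' or 'file-lists'
        'engine': 'standalone',  # or 'rejester'
    },
}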
Example No. 48
def test_external_stage_default(tmpdir):
    with yakonfig.defaulted_config(
        [streamcorpus_pipeline],
            config={
                'streamcorpus_pipeline': {
                    'external_stages_path': __file__,
                    'reader': 'from_local_chunks',
                    'writers': ['to_local_chunks'],
                    'tmp_dir_path': str(tmpdir),
                },
            }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'default message'
Example No. 49
def main():
    '''Launch the AMQP worker.'''
    filters = {
        'already_labeled': already_labeled,
        'geotime': geotime,
    }

    worker = AMQPWorker(filters)
    parser = argparse.ArgumentParser()
    modules = [yakonfig, kvlayer, dblogger, coordinate, worker]
    yakonfig.parse_args(parser, modules)
    worker.task_master = coordinate.TaskMaster(
        yakonfig.get_global_config(coordinate.config_name))

    worker.start()
Example No. 50
def test_external_stage_unregistered(tmpdir):
    with yakonfig.defaulted_config(
        [streamcorpus_pipeline],
            config={
                'streamcorpus_pipeline': {
                    'tmp_dir_path': str(tmpdir),
                    'external_stage': {
                        'message': 'configured message',
                    },
                },
            }):
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'external_stage')
        stage = ExternalStage(config=config)
        assert stage.get_message() == 'configured message'
def test_spinn3r_pipeline_unprefetched(urls, pipeline_config):
    """minimal end-to-end test, missing prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': True
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = 'test_file.bin'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        with pytest.raises(ConfigurationError):
            pipeline._process_task(work_unit)
def test_spinn3r_pipeline_filter_no_matches(filename, pipeline_config,
                                            output_file):
    """set a publisher_type filter that matches nothing in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'MICROBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        # no chunks means the output file won't actually get written
        assert not os.path.exists(output_file)
def test_spinn3r_pipeline_filter_matches(filename, urls, pipeline_config,
                                         output_file):
    """set a publisher_type filter that matches everything in the feed"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'publisher_type': 'WEBLOG'
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        work_unit = SimpleWorkUnit(filename)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example No. 54
def chunks(configurator, test_data_dir, overlay={}):
    with configurator(overlay):
        path = get_test_v0_3_0_chunk_path(test_data_dir)
        config = yakonfig.get_global_config('streamcorpus_pipeline',
                                            'to_kvlayer')
        writer = to_kvlayer(config)

        ## name_info and i_str are not used by the writer
        i_str = ''
        name_info = {}
        writer(path, name_info, i_str)

        client = kvlayer.client()
        client.setup_namespace({
            'stream_items': 2,
            'stream_items_doc_id_epoch_ticks': 2,
            'stream_items_with_source': 2
        })
        yield path, client
Example No. 55
def v1_suggest_get(request, response, tfidf, akagraph, query):
    '''Gather suggestions from various engines and within this dossier
    stack instance and filter/rank them before sending to requestor.

    '''
    if not isinstance(query, unicode):
        query = query.decode('utf8')

    config = yakonfig.get_global_config('memex_dossier.models')
    suggest_services = config.get('suggest_services', [])
    session = requests.Session()
    suggestions = []

    akagraph_config = config.get('akagraph')
    if akagraph_config:
        # (86) 13380344114
        #for candidate in phonenumber_matcher(
        #        query, country='CN'):
        #    query = candidate['canonical']
        #    break
        cluster = []
        logger.info('doing query %r', query)
        cc = akagraph.find_connected_component(query, use_soft=False)
        for rec, confidence in cc:
            rec['confidence'] = confidence
            cluster.append(rec)
        suggestions.append(cluster)

    logger.info('querying %d suggest_services', len(suggest_services))
    for url in suggest_services:
        try:
            url = url % dict(query=query)
        except Exception as exc:
            logger.error('failed to insert query=%r into pattern: %r', query,
                         url)
            continue
        try:
            resp = session.get(url, timeout=5)
        except Exception as exc:
            logger.error('failed to retrieve %r', url)
            continue
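A hedged sketch of the memex_dossier.models configuration this handler consumes; each suggest_services entry is an old-style format string filled in via url % dict(query=query), so it needs a %(query)s placeholder. The URL is hypothetical:

config = {
    'memex_dossier.models': {
        'suggest_services': [
            'https://suggest.example.com/api?q=%(query)s',
        ],
        # 'akagraph': {...},  # optional; enables the connected-component lookup
    },
}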
Example No. 56
def main():
    '''Run the random recommender system on a sequence of topics.
    '''
    description = (
        'A baseline recommender system that uses the truth data to'
        ' create output that has perfect recall and would also have'
        ' perfect precision if you ignore subtopic diversity/novelty.'
        ' This generates output directly from the truth data and'
        ' randomly shuffles the truth data per topic, so that'
        ' the ordering of passages does not attempt to optimize any'
        ' particular quality metric.')
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('--overwrite', action='store_true')
    args = yakonfig.parse_args(parser, [yakonfig])

    logging.basicConfig(level=logging.DEBUG)

    config = yakonfig.get_global_config('harness')
    batch_size = config.get('batch_size', 5)
    run_file_path = config['run_file_path']
    if os.path.exists(run_file_path):
        if args.overwrite:
            os.remove(run_file_path)
        else:
            sys.exit('%r already exists' % run_file_path)

    kvl_config = {
        'storage_type': 'local',
        'namespace': 'test',
        'app_name': 'test'
    }
    kvl = kvlayer.client(kvl_config)
    label_store = LabelStore(kvl)

    parse_truth_data(label_store, config['truth_data_path'])

    # Set up the system
    doc_store = make_doc_store(label_store)
    system = RandomSystem(doc_store)
    ambassador = HarnessAmbassadorCLI(system, args.config, batch_size)
    ambassador.run()
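A minimal sketch of the 'harness' configuration block the recommender script above expects; the keys come from the code, the values are illustrative:

config = {
    'harness': {
        'batch_size': 5,
        'run_file_path': 'runs/baseline-run.txt',
        'truth_data_path': 'truth_data.json',
    },
}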
def test_spinn3r_pipeline_ignore_prefetched(filename, urls, pipeline_config,
                                            output_file):
    """configuration explicitly ignores bad prefetched data"""
    pipeline_config['streamcorpus_pipeline']['from_spinn3r_feed'] = {
        'use_prefetched': False
    }
    with yakonfig.defaulted_config([streamcorpus_pipeline],
                                   config=pipeline_config):
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(yakonfig.get_global_config('streamcorpus_pipeline'))
        key = filename
        from_spinn3r_feed._prefetched[key] = 'bogus data, dude!'
        work_unit = SimpleWorkUnit(key)
        work_unit.data['start_chunk_time'] = 0
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)
        del from_spinn3r_feed._prefetched[key]

        with Chunk(path=output_file, mode='rb') as chunk:
            assert [si.abs_url for si in chunk] == urls
Example No. 58
def rejester_run_function(work_unit):
    with yakonfig.defaulted_config([kvlayer, streamcorpus_pipeline],
                                   config=work_unit.spec.get('config', {})):
        scp_config = yakonfig.get_global_config('streamcorpus_pipeline')
        stages = PipelineStages()
        if 'external_stages_path' in scp_config:
            stages.load_external_stages(scp_config['external_stages_path'])
        if 'external_stages_modules' in scp_config:
            for mod in scp_config['external_stages_modules']:
                stages.load_module_stages(mod)
        factory = PipelineFactory(stages)
        pipeline = factory(scp_config)

        work_unit.data['start_chunk_time'] = time.time()
        work_unit.data['start_count'] = 0
        pipeline._process_task(work_unit)

        ## explicitly call cleanup, which is idempotent and might not
        ## get called by atexit if we are running under
        ## multiprocessing
        pipeline.cleanup()