def test_pipeline(request, test_data_dir):
    """Check that shutting down a running pipeline with SIGTERM exits.

    Loads ``test_dedup_chunk_counts.yaml`` via ``request.fspath.dirpath``,
    feeds the test chunk path to the pipeline through a substituted
    ``sys.stdin``, starts ``p._process_task`` in a greenlet, and asserts
    that ``p.shutdown(sig=signal.SIGTERM)`` raises ``SystemExit``.

    The real ``sys.stdin`` is always restored before returning, so the
    substitution cannot leak into tests that run afterwards.
    """
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0

            ## process the work unit in the background, give it a few
            ## seconds of run time, then request shutdown
            g = gevent.spawn(p._process_task, work_unit)
            gevent.sleep(5)

            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)

            logger.debug('now joining...')
            ## bound the wait so a stuck greenlet cannot hang the test run
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            ## put the real stdin back even if an assertion above fails
            sys.stdin = stdin
def make_hyperlink_labeled_test_chunk(tmpdir):
    '''
    Build a hyperlink-labeled copy of the test chunk inside tmpdir and
    return the path of the new chunk file.

    Every item read from the test chunk has its body labels and
    sentences reset, is passed through the hyperlink_labels and
    make_clean_visible stages, and is then appended to the output chunk.
    '''
    out_path = tmpdir.join(str(uuid.uuid1()) + '.sc')
    writer = Chunk(out_path, mode='wb')

    ## NOTE(review): the other callers in this file pass test_data_dir to
    ## get_test_chunk_path -- confirm the no-argument call is intended
    in_path = get_test_chunk_path()

    labeler_config = {
        'require_abs_url': True,
        'all_domains': True,
        'offset_types': [BYTES],
    }
    labeler = hyperlink_labels(config=labeler_config)
    cleaner = make_clean_visible(config={})

    reader = Chunk(path=in_path, message=streamcorpus.StreamItem_v0_2_0)
    for item in reader:
        ## drop whatever labels and tokens the item arrived with
        item.body.labels = {}
        item.body.sentences = {}
        ## both stages share one fresh context per item
        ctx = {}
        labeler(item, ctx)
        cleaner(item, ctx)
        writer.add(item)

    writer.close()
    return out_path
def test_pipeline(request, test_data_dir):
    """Check that shutting down a running pipeline with SIGTERM exits.

    Loads ``test_dedup_chunk_counts.yaml`` via ``request.fspath.dirpath``,
    feeds the test chunk path to the pipeline through a substituted
    ``sys.stdin``, starts ``p._process_task`` in a greenlet, and asserts
    that ``p.shutdown(sig=signal.SIGTERM)`` raises ``SystemExit``.

    The real ``sys.stdin`` is always restored before returning, so the
    substitution cannot leak into tests that run afterwards.
    """
    filename = str(request.fspath.dirpath('test_dedup_chunk_counts.yaml'))
    with yakonfig.defaulted_config([streamcorpus_pipeline], filename=filename):
        ## config says read from stdin, so make that have what we want
        stdin = sys.stdin
        sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
        try:
            ## run the pipeline
            stages = PipelineStages()
            pf = PipelineFactory(stages)
            p = pf(yakonfig.get_global_config('streamcorpus_pipeline'))

            from streamcorpus_pipeline.run import SimpleWorkUnit
            work_unit = SimpleWorkUnit('long string indicating source of text')
            work_unit.data['start_chunk_time'] = time.time()
            work_unit.data['start_count'] = 0

            ## process the work unit in the background, give it a few
            ## seconds of run time, then request shutdown
            g = gevent.spawn(p._process_task, work_unit)
            gevent.sleep(5)

            with pytest.raises(SystemExit):  # pylint: disable=E1101
                p.shutdown(sig=signal.SIGTERM)

            logger.debug('now joining...')
            ## bound the wait so a stuck greenlet cannot hang the test run
            timeout = gevent.Timeout(1)
            g.join(timeout=timeout)
        finally:
            ## put the real stdin back even if an assertion above fails
            sys.stdin = stdin
def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    """Upgrade leading v0_2_0 items of the test chunk and check the version.

    Each upgraded item must report the ``v0_3_0`` version value; the loop
    stops once the eleventh item has been checked.
    """
    upgrader = upgrade_streamcorpus_v0_3_0(config={})
    old_items = streamcorpus.Chunk(
        get_test_chunk_path(test_data_dir),
        message=streamcorpus.StreamItem_v0_2_0)
    ## number the items from 1 so the cut-off below reads as "items seen"
    for seen, old_item in enumerate(old_items, 1):
        upgraded = upgrader(old_item)
        assert upgraded.version == \
            streamcorpus.Versions._NAMES_TO_VALUES["v0_3_0"]
        if seen > 10:
            break
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    """Run the pipeline configured by test_dedup_chunk_counts.yaml.

    The yaml file is combined with a ``tmp_dir_path`` override pointing at
    pytest's ``tmpdir``, and the resulting pipeline is run on the test
    chunk path.
    """
    yaml_path = request.fspath.dirpath('test_dedup_chunk_counts.yaml')
    overrides = {'tmp_dir_path': str(tmpdir)}
    with yakonfig.defaulted_config(
            [streamcorpus_pipeline],
            filename=str(yaml_path),
            config=overrides) as config:
        ## build the pipeline from the active configuration and run it
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(config['streamcorpus_pipeline'])
        input_path = get_test_chunk_path(test_data_dir)
        pipeline.run(input_path)
def test_dedup_chunk_counts(request, test_data_dir, tmpdir):
    """Run the pipeline configured by test_dedup_chunk_counts.yaml.

    The yaml file is combined with a ``tmp_dir_path`` override pointing at
    pytest's ``tmpdir``, and the resulting pipeline is run on the test
    chunk path.
    """
    yaml_path = request.fspath.dirpath('test_dedup_chunk_counts.yaml')
    overrides = {'tmp_dir_path': str(tmpdir)}
    with yakonfig.defaulted_config(
            [streamcorpus_pipeline],
            filename=str(yaml_path),
            config=overrides) as config:
        ## build the pipeline from the active configuration and run it
        stages = PipelineStages()
        factory = PipelineFactory(stages)
        pipeline = factory(config['streamcorpus_pipeline'])
        input_path = get_test_chunk_path(test_data_dir)
        pipeline.run(input_path)
def test_post_batch_incremental_stage(request, test_data_dir):
    """Run a Pipeline built from test_post_batch_incremental.yaml.

    The yaml file is read from the directory of this module, the test
    chunk path is supplied to the pipeline through a substituted
    ``sys.stdin``, and ``Pipeline(config).run()`` is executed.

    The config file handle is closed as soon as it has been parsed and
    the real ``sys.stdin`` is restored before returning, so neither can
    leak into tests that run afterwards.
    """
    path = os.path.dirname(__file__)
    config_path = os.path.join(path, 'test_post_batch_incremental.yaml')
    with open(config_path) as config_file:
        ## NOTE(review): yaml.load without an explicit Loader can build
        ## arbitrary Python objects and is rejected by newer PyYAML
        ## releases; this is a local fixture, but consider yaml.safe_load
        config = yaml.load(config_file)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
    try:
        ## run the pipeline
        p = Pipeline(config)
        p.run()
    finally:
        ## put the real stdin back even if the pipeline raises
        sys.stdin = stdin
def test_upgrade_streamcorpus_v0_3_0(test_data_dir):
    """Upgrade leading v0_2_0 items of the test chunk and check the version.

    Each upgraded item must report the ``v0_3_0`` version value; the loop
    stops once the eleventh item has been checked.
    """
    upgrader = upgrade_streamcorpus_v0_3_0(config={})
    old_items = streamcorpus.Chunk(
        get_test_chunk_path(test_data_dir),
        message=streamcorpus.StreamItem_v0_2_0)
    ## number the items from 1 so the cut-off below reads as "items seen"
    for seen, old_item in enumerate(old_items, 1):
        upgraded = upgrader(old_item)
        assert upgraded.version == \
            streamcorpus.Versions._NAMES_TO_VALUES['v0_3_0']
        if seen > 10:
            break
def test_post_batch_incremental_stage(request, test_data_dir):
    """Run a Pipeline built from test_post_batch_incremental.yaml.

    The yaml file is read from the directory of this module, the test
    chunk path is supplied to the pipeline through a substituted
    ``sys.stdin``, and ``Pipeline(config).run()`` is executed.

    The config file handle is closed as soon as it has been parsed and
    the real ``sys.stdin`` is restored before returning, so neither can
    leak into tests that run afterwards.
    """
    path = os.path.dirname(__file__)
    config_path = os.path.join(path, 'test_post_batch_incremental.yaml')
    with open(config_path) as config_file:
        ## NOTE(review): yaml.load without an explicit Loader can build
        ## arbitrary Python objects and is rejected by newer PyYAML
        ## releases; this is a local fixture, but consider yaml.safe_load
        config = yaml.load(config_file)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_chunk_path(test_data_dir))
    try:
        ## run the pipeline
        p = Pipeline(config)
        p.run()
    finally:
        ## put the real stdin back even if the pipeline raises
        sys.stdin = stdin
def make_hyperlink_labeled_test_chunk(tmpdir):
    """
    Build a hyperlink-labeled copy of the test chunk inside tmpdir and
    return the path of the new chunk file.

    Every item read from the test chunk has its body labels and
    sentences reset, is passed through the hyperlink_labels and
    make_clean_visible stages, and is then appended to the output chunk.
    """
    out_path = tmpdir.join(str(uuid.uuid1()) + ".sc")
    writer = Chunk(out_path, mode="wb")

    ## NOTE(review): the other callers in this file pass test_data_dir to
    ## get_test_chunk_path -- confirm the no-argument call is intended
    in_path = get_test_chunk_path()

    labeler_config = {
        "require_abs_url": True,
        "all_domains": True,
        "offset_types": [BYTES],
    }
    labeler = hyperlink_labels(config=labeler_config)
    cleaner = make_clean_visible(config={})

    reader = Chunk(path=in_path, message=streamcorpus.StreamItem_v0_2_0)
    for item in reader:
        ## drop whatever labels and tokens the item arrived with
        item.body.labels = {}
        item.body.sentences = {}
        ## both stages share one fresh context per item
        ctx = {}
        labeler(item, ctx)
        cleaner(item, ctx)
        writer.add(item)

    writer.close()
    return out_path