def test_pipeline(monkeypatch):
    """Start a pipeline task in a greenlet and shut it down with SIGTERM.

    ``sys.exit`` is replaced with a stub that logs the status and raises
    ``SuccessfulExit``; the test passes when ``Pipeline.shutdown`` raises
    that exception, i.e. when shutdown reaches ``sys.exit``.

    :param monkeypatch: pytest fixture; used so that the patched
        ``sys.exit`` and ``sys.stdin`` are restored after the test.
    """
    def mockexit(status=0):
        # Stand-in for sys.exit: record the call and raise a marker
        # exception instead of terminating the interpreter.
        log( ' sys.exit(%d) ' % status )
        raise SuccessfulExit()

    monkeypatch.setattr(sys, 'exit', mockexit)

    path = os.path.dirname(__file__)
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and rejected by recent PyYAML releases; the call is
    # kept unchanged here because the fixture lives next to this test.
    with open(os.path.join(path, 'test_dedup_chunk_counts.yaml')) as config_file:
        config = yaml.load(config_file)

    ## config says read from stdin, so make that have what we want.
    ## Going through monkeypatch restores the real stdin afterwards
    ## instead of leaking the StringIO into later tests.
    monkeypatch.setattr(sys, 'stdin', StringIO(get_test_chunk_path()))

    ## run the pipeline
    p = Pipeline(config)

    from streamcorpus_pipeline.run import SimpleWorkUnit
    work_unit = SimpleWorkUnit('long string indicating source of text')
    work_unit.data['start_chunk_time'] = time.time()
    work_unit.data['start_count'] = 0
    g = gevent.spawn(p._process_task, work_unit)

    # Yield to the spawned greenlet for a while before requesting shutdown.
    gevent.sleep(5)

    with pytest.raises(SuccessfulExit):  # pylint: disable=E1101
        p.shutdown(sig=signal.SIGTERM)

    log( 'now joining...' )
    # Bound the wait for the worker greenlet to finish.
    timeout = gevent.Timeout(1)
    g.join(timeout=timeout)
def test_dedup_chunk_counts():
    """Run the pipeline configured by ``test_dedup_chunk_counts.yaml``.

    The config is loaded from the directory containing this file, and
    ``sys.stdin`` is temporarily replaced with a ``StringIO`` holding the
    value returned by ``get_test_chunk_path()``. The test passes if
    ``Pipeline.run`` completes without raising.
    """
    path = os.path.dirname(__file__)
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and rejected by recent PyYAML releases; the call is
    # kept unchanged here because the fixture lives next to this test.
    with open(os.path.join(path, 'test_dedup_chunk_counts.yaml')) as config_file:
        config = yaml.load(config_file)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_chunk_path())
    try:
        ## run the pipeline
        p = Pipeline(config)
        p.run()
    finally:
        # Put the real stdin back so later tests are unaffected.
        sys.stdin = stdin
def test_align_serif_stage():
    """Run the pipeline configured by ``test_align_serif_stage.yaml``.

    The config is loaded from the directory containing this file, and
    ``sys.stdin`` is temporarily replaced with a ``StringIO`` holding the
    value returned by ``get_test_v0_3_0_chunk_tagged_by_serif_path()``.
    The test passes if ``Pipeline.run`` completes without raising.
    """
    path = os.path.dirname(__file__)
    # NOTE(review): yaml.load without an explicit Loader is unsafe on
    # untrusted input and rejected by recent PyYAML releases; the call is
    # kept unchanged here because the fixture lives next to this test.
    with open(os.path.join(path, 'test_align_serif_stage.yaml')) as config_file:
        config = yaml.load(config_file)

    ## config says read from stdin, so make that have what we want
    stdin = sys.stdin
    sys.stdin = StringIO(get_test_v0_3_0_chunk_tagged_by_serif_path())
    try:
        ## run the pipeline
        p = Pipeline(config)
        p.run()
    finally:
        # Put the real stdin back so later tests are unaffected.
        sys.stdin = stdin