def test_serve_continuation(reader, writer, ledger, pre_served_size):
    job = Job('somejob', reader, writer, ledger)
    # pre-mark the first `pre_served_size` records in the ledger
    for i in range(pre_served_size):
        ledger.set(job.ledger_id(i), 'fake value')
    items = job.serve(50)
    n_items = CSV_SIZE - pre_served_size
    assert len(items) == n_items

def test_receive_cont(job):
    n_served = 10
    n_received = 5
    job_name = job.name
    ledger = job.ledger
    items = job.serve(n_served)
    job.receive(items[:n_received], False)
    reader = CsvReader({'input_file_path': job.reader.file_path})
    writer = CsvWriter({'output_file_path': job.writer.file_path})
    del job
    # we processed half of the served items; create a continuation job
    cont_job = Job(job_name, reader, writer, ledger, cont=True)
    # request the second half of the items
    items = cont_job.serve(n_served - n_received)
    cont_job.receive(items, False)
    # count the ledger entries by state
    received = [
        x for x in ledger.scan_iter(f'{job_name}*')
        if ledger.get(x).decode('utf8') == RECEIVED
    ]
    served = [
        x for x in ledger.scan_iter(f'{job_name}*')
        if ledger.get(x).decode('utf8') == SERVED
    ]
    assert len(received) == n_served
    assert len(cont_job.received) == n_served
    assert len(served) == 0

def test_restart(reader, writer, ledger):
    job_name = 'somejob'
    n_skips = 10
    for i in range(n_skips):
        ledger.set(f'{job_name}:{i}', RECEIVED)
    assert len(list(ledger.scan_iter(f'{job_name}*'))) == n_skips
    job = Job(job_name, reader, writer, ledger)
    job.restart()
    # restart() wipes every ledger record for the job
    assert not list(ledger.scan_iter(f'{job_name}*'))
    assert not job.served
    assert not job.received

def test_restore_records(reader, writer, ledger):
    job_name: str = 'somejob'
    n_skips: int = 10
    for i in range(n_skips):
        ledger.set(f'{job_name}:{i}', RECEIVED)
    # one of the skipped records is served, not yet received
    ledger.set(f'{job_name}:1', SERVED)
    job: Job = Job(job_name, reader, writer, ledger)
    assert len(job.received) == n_skips - 1
    assert len(job.served) == 1
    items = job.serve(100)
    assert len(items) == CSV_SIZE - n_skips

def _load_jobs(ledger) -> Dict:
    """Restore every job recorded in the ledger under a 'JOB:*' key."""
    jobs: Dict = {}
    for job_key in ledger.scan_iter('JOB:*'):
        job_name: str = job_key.decode('utf8').split(':', 1)[1]
        try:
            job = Job.restore_job(job_name, job_key, ledger)
            jobs[job_name] = job
            # only jobs with a writer claim an output path
            if job.writer:
                OUTPUT_REGISTRY.add(job.writer.file_path)
        except json.JSONDecodeError:
            logging.error(f'Could not restore job: {job_key}')
        except FileNotFoundError as e:
            logging.error(f'Could not restore job: {job_key}; {e}')
    return jobs

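# Illustrative sketch (not part of the module) of the ledger layout that
# _load_jobs() and Job.restore_job() rely on, as exercised by the tests:
# one 'JOB:<name>' key holding the job's JSON description, plus one
# '<name>:<index>' key per item recording its SERVED/RECEIVED state.
# The file paths below are placeholders.
#
#   LEDGER.set('JOB:somejob', json.dumps({
#       'metadata': {'input_file_path': 'in.csv',
#                    'output_file_path': 'out.csv'},
#       'reader_name': 'CsvReader',
#       'writer_name': 'CsvWriter',
#       'mode': READ_WRITE,
#   }))
#   LEDGER.set('somejob:0', RECEIVED)  # item 0 was handed out and returned
#   LEDGER.set('somejob:1', SERVED)    # item 1 was handed out, not yet returned
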
def test_status(reader, writer, ledger):
    job = Job('somejob', reader, writer, ledger)
    assert job.status == IN_PROGRESS
    items = job.serve(5)
    assert job.status == IN_PROGRESS
    items.extend(job.serve(100))
    # returning every served item closes the job
    job.receive(items, True)
    assert job.status == COMPLETE

def test_restore_job(reader, writer, ledger):
    job_name: str = 'testjob'
    job_key: str = 'JOB:testjob'
    metadata: Dict = {
        'input_file_path': reader.file_path,
        'output_file_path': writer.file_path
    }
    value = json.dumps({
        'metadata': metadata,
        'reader_name': type(reader).__name__,
        'writer_name': type(writer).__name__,
        'mode': READ_WRITE
    })
    ledger.set(job_key, value)
    n_skips: int = 10
    for i in range(n_skips):
        ledger.set(f'{job_name}:{i}', RECEIVED)
    ledger.set(f'{job_name}:1', SERVED)
    job: Job = Job.restore_job(job_name, job_key, ledger)
    assert len(job.received) == n_skips - 1
    assert len(job.served) == 1
    records = job.serve(100)
    assert len(records) == CSV_SIZE - n_skips

@pytest.fixture
def writing_job(writer, ledger):
    job_name = 'writing-job'
    yield Job(job_name, None, writer, ledger, WRITE_ONLY)

@pytest.fixture
def reading_job(reader, ledger):
    job_name = 'reading-job'
    yield Job(job_name, reader, None, ledger, READ_ONLY)

@pytest.fixture
def job(reader, writer, ledger):
    job_name = 'somejob'
    yield Job(job_name, reader, writer, ledger)

def test_restore_unknown_job(ledger):
    job_name: str = 'testjob'
    job_key: str = 'JOB:testjob'
    # the ledger holds no 'JOB:testjob' entry, so restoration returns None
    job: Optional[Job] = Job.restore_job(job_name, job_key, ledger)
    assert job is None

def test_serve_batch(reader, writer, ledger, batch_size):
    job = Job('somejob', reader, writer, ledger)
    items = job.serve(batch_size)
    # serve() caps the batch at the number of available records
    n_items = min(batch_size, 30)
    assert len(items) == n_items

def scramble(job_name: str,
             metadata: Dict,
             reader_name: str,
             writer_name: str,
             token: Union[str, None] = None,
             clean_start: bool = False,
             mode: str = READ_WRITE,
             cont: bool = False,
             force_overwrite: bool = False):
    """
    Start a new job.

    :param job_name: job name
    :param metadata: I/O classes configuration
    :param reader_name: reader class name
    :param writer_name: writer class name
    :param token: authentication token for the job; defaults to no authentication
    :param clean_start: wipe the job's ledger records before starting
    :param mode: I/O mode
    :param cont: continue an existing job as a repair job
    :param force_overwrite: force overwriting of an already registered output path
    """
    logging.info(
        util.pink(f'SCRAMBLING: name->{job_name}; metadata->{metadata}; '
                  f'reader_name->{reader_name}; writer_name->{writer_name}; '
                  f'clean_start->{clean_start}'))
    reader, writer = _make_io(reader_name, writer_name, metadata)

    # refuse output paths already claimed by another job
    if not force_overwrite and writer and writer.file_path in OUTPUT_REGISTRY:
        msg = f'Output path not allowed `{writer.file_path}`.'
        raise HTTPException(status_code=400, detail=msg)

    # look for a job with that name in the ledger
    existing_job = LEDGER.get(f'JOB:{job_name}')

    # re-creating an existing job is only allowed with clean_start or cont
    if existing_job and not (clean_start or cont):
        msg = f'Job {job_name} already exists.'
        raise HTTPException(status_code=400, detail=msg)

    # continue where the existing job left off
    if existing_job and cont:
        job = JOB_LOG[job_name]
        # delete the job's records but keep its output file
        job.clean(output=False)
        del job
        del JOB_LOG[job_name]

    new_job: Job = Job(job_name, reader, writer, LEDGER, mode, cont)

    # clean the ledger before starting
    if clean_start:
        new_job.restart()

    # log the new job
    JOB_LOG[job_name] = new_job
    if writer:
        OUTPUT_REGISTRY.add(writer.file_path)
    LEDGER.set(
        f'JOB:{job_name}',
        json.dumps({
            'metadata': metadata,
            'reader_name': reader_name,
            'writer_name': writer_name,
            'mode': mode
        }))
    _add_token(job_name, LEDGER, token)
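
# Minimal usage sketch for scramble(), assuming the CsvReader/CsvWriter pair
# exercised in the tests above; the metadata keys match those classes'
# configurations and the file paths are placeholders.
#
#   scramble(job_name='somejob',
#            metadata={'input_file_path': 'input.csv',
#                      'output_file_path': 'output.csv'},
#            reader_name='CsvReader',
#            writer_name='CsvWriter',
#            clean_start=True)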