import csv
import datetime
import logging
import sys
import time
import uuid
from pathlib import Path

import requests

# db, utils and catsgo are project-local modules; mydb, dirwatcher_metadata,
# config and the remaining helper functions are defined elsewhere in the project.


def watch(flow_name="oxforduni-gpas-sars-cov2-illumina"):
    apex_token = None
    apex_token_time = 0
    config = utils.load_oracle_config("config.json")

    while True:
        # get a new token every 5 hours (& at startup)
        time_now = int(time.time())
        apex_token_age = time_now - apex_token_time
        if apex_token_age > 5 * 60 * 60:
            logging.info(f"Acquiring new token (token age: {apex_token_age}s)")
            apex_token = db.get_apex_token()
            apex_token_time = time_now

        # new runs to submit are sp3 runs that have finished with status OK,
        # minus runs that have been marked as already submitted
        finished_ok_sp3_runs = set(get_finished_ok_sp3_runs(flow_name))
        submitted_runs = set(get_submitted_runlist(flow_name))
        new_runs_to_submit = finished_ok_sp3_runs.difference(submitted_runs)

        for new_run_uuid in new_runs_to_submit:
            logging.info(f"new run: {new_run_uuid}")
            process_run(new_run_uuid, config, apex_token)
            add_to_submitted_runlist(flow_name, new_run_uuid)

        logging.info("sleeping for 60")
        time.sleep(60)
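# The watch loop above relies on get_finished_ok_sp3_runs, get_submitted_runlist
# and add_to_submitted_runlist, which are not shown here. Below is a minimal
# sketch of how the two run-list helpers could be backed by Mongo, mirroring the
# dirlist/ignore_list pattern used further down; the "submitted_runlist"
# collection name and document shape are assumptions, not confirmed by the source.

submitted_runlist = mydb["submitted_runlist"]  # hypothetical collection


def get_submitted_runlist(flow_name):
    # return run uuids already submitted for this flow (empty list if none yet)
    doc = submitted_runlist.find_one({"flow_name": flow_name})
    return doc.get("runs", []) if doc else []


def add_to_submitted_runlist(flow_name, run_uuid):
    # record a run uuid as submitted, creating the per-flow document on first use
    submitted_runlist.update_one(
        {"flow_name": flow_name},
        {"$addToSet": {"runs": run_uuid}},
        upsert=True,
    )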
def submit_sample_data_error(
    error_str="Error: unspecified error",
    apex_database_sample_name=None,
    sp3_sample_name=None,
    sp3_run_uuid=None,
    apex_token=None,
    config=None,
):
    """Given an Oracle sample name, or an sp3 sample name plus run uuid,
    submit an error status for the sample."""
    if not apex_database_sample_name and not (sp3_sample_name and sp3_run_uuid):
        logging.error(
            "submit_sample_data_error: you need to provide either the Oracle sample name or the sp3 sample name and run uuid"
        )
        return

    if sp3_sample_name and sp3_run_uuid:
        m = get_sample_map_for_run(sp3_run_uuid)
        if not m:
            logging.error("submit_sample_data_error: couldn't get sample map for run")
            return
        apex_database_sample_name = m.get(sp3_sample_name)
        if not apex_database_sample_name:
            logging.error(
                f"submit_sample_data_error: couldn't find {sp3_sample_name} in map"
            )
            return

    if not apex_token:
        apex_token = db.get_apex_token()
    if not config:
        config = utils.load_oracle_config("config.json")

    if not isinstance(error_str, str):  # just in case
        error_str = str(error_str)

    data = {
        "sample": {
            "operations": [
                {"op": "replace", "path": "errorMsg", "value": error_str},
                {"op": "replace", "path": "status", "value": "Error"},
            ]
        }
    }
    sample_data_response = requests.put(
        f"{config['host']}/samples/{apex_database_sample_name}",
        headers={"Authorization": f"Bearer {apex_token}"},
        json=data,
    )
    logging.info(
        f"PUTting error to {config['host']}/samples/{apex_database_sample_name}"
    )
    return sample_data_response.text
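# A minimal usage sketch for submit_sample_data_error; the sample names and run
# uuid below are hypothetical, and a valid config.json is assumed.

# by Oracle/APEX database sample name:
submit_sample_data_error(
    error_str="Error: no reads found for sample",
    apex_database_sample_name="APEX-SAMPLE-123",
)

# by sp3 sample name plus run uuid (resolved via get_sample_map_for_run):
submit_sample_data_error(
    error_str="Error: pipeline failed",
    sp3_sample_name="sample1",
    sp3_run_uuid="0f6f2d86-0000-0000-0000-000000000000",
)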
def watch(
    watch_dir="/data/inputs/s3/oracle-test",
    bucket_name="catsup-test",
    max_submission_attempts=3,
    flow="ncov2019-artic-nf",
):
    """
    Watch watch_dir for new directories that contain an upload_done.txt file
    (signalling that an upload was successful).

    watch_dir: e.g. /data/inputs/s3/oracle-test (for the catsup-test bucket;
        in the future we should probably name the directories the same as the
        bucket name!)
    bucket_name: the bucket that's mounted at watch_dir (used by the pipeline
        to fetch the sample files)
    flow: currently a choice between ncov2019-artic-nf and sars-cov2_workflows
    """
    print(watch.__doc__)
    watch_dir = Path(watch_dir)
    if not watch_dir.is_dir():
        logging.error(f"{watch_dir} is not a directory")
        sys.exit(1)

    while True:
        # get all directories in the bucket; directories are named after
        # submission uuids, so this is effectively a list of submission uuids
        candidate_dirs = set(x.name for x in watch_dir.glob("*") if x.is_dir())
        # get directories/submissions that have already been processed
        cached_dirlist = set(get_cached_dirlist(str(watch_dir)))
        # get directories/submissions that have failed
        bad_submission_uuids = set(get_ignore_list(str(watch_dir)))
        # submissions to be processed are those that are new and have not been
        # marked as failed
        new_dirs = candidate_dirs.difference(cached_dirlist)
        new_dirs = new_dirs.difference(bad_submission_uuids)

        if new_dirs:
            apex_token = db.get_apex_token()
            for new_dir in new_dirs:
                # new_dir is the catsup upload uuid
                r = process_dir(
                    new_dir,
                    watch_dir,
                    bucket_name,
                    apex_token,
                    max_submission_attempts,
                    flow,
                )
                if r:
                    # if we've started a run then stop processing and go to sleep;
                    # this prevents the system from being overwhelmed by nextflow
                    # starting too many runs at once
                    break

        print("sleeping for 60")
        time.sleep(60)
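# The watch loop above assumes get_cached_dirlist and get_ignore_list. A minimal
# sketch, assuming the same Mongo document shapes that the cleanup snippet below
# pulls from ({"watch_dir": ..., "dirs": [...]} and
# {"watch_dir": ..., "ignore_list": [...]}); untested.


def get_cached_dirlist(watch_dir):
    # directories (submission uuids) already processed for this watch_dir
    doc = mydb["dirlist"].find_one({"watch_dir": watch_dir})
    return doc.get("dirs", []) if doc else []


def get_ignore_list(watch_dir):
    # directories (submission uuids) that previously failed and should be skipped
    doc = mydb["ignore_list"].find_one({"watch_dir": watch_dir})
    return doc.get("ignore_list", []) if doc else []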
dirlist = mydb["dirlist"]
ignore_list = mydb["ignore_list"]

print(f"removing {new_dir} from mongo dirlist/ignore_list of {watch_dir}")
dirlist.update_one(
    {"watch_dir": watch_dir}, {"$pull": {"dirs": new_dir}}, upsert=True
)
ignore_list.update_one(
    {"watch_dir": watch_dir}, {"$pull": {"ignore_list": new_dir}}, upsert=True
)

apex_token = db.get_apex_token()
headers = {"Authorization": f"Bearer {apex_token}"}

# get organisations to look up their input buckets
orgs = set()
url = "https://portal.dev.gpas.ox.ac.uk/ords/gpasdevpdb1/grsp/sp3/organisations"
response = requests.get(url, headers=headers).json()
org_buckets = dict()
for org in response["items"]:
    org_buckets[org["organisationName"]] = org["inputBucketName"]

# get batches that are in the Uploaded state
url = "https://portal.dev.gpas.ox.ac.uk/ords/gpasdevpdb1/grsp/sp3/batches_by_status/Uploaded"
response = requests.get(url, headers=headers).json()
found_batches = []
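# A sketch of how the two lookups above could be joined to populate
# found_batches; the "organisationName" field on each batch record is an
# assumption (only the org_buckets fields are confirmed by the code above).
for batch in response.get("items", []):
    input_bucket = org_buckets.get(batch.get("organisationName"))
    if input_bucket is None:
        logging.warning(f"no input bucket found for batch {batch}")
        continue
    found_batches.append((batch, input_bucket))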
def process_batch(sample_method, samples_to_submit, batch_dir, workflow):
    print(f"processing {samples_to_submit}")
    samples = list()
    sample_shards = dict()
    batch_name = "ENA-" + str(uuid.uuid4())[:7]
    submission_name = f"Entry for ENA sample processing - {batch_name}"

    for sample, ena_metadata in samples_to_submit:
        p = {
            "name": sample.name,
            "tags": ["ENA_Data"],
            "submissionTitle": submission_name,
            "submissionDescription": submission_name,
            "control": ena_metadata["control"],
            "collection_date": ena_metadata["collection_date"],
            "status": "Uploaded",
            "country": ena_metadata["country"],
            "region": ena_metadata["region"],
            "district": ena_metadata["district"],
            "specimen": ena_metadata["specimen_organism"],
            "host": ena_metadata["host"],
            "instrument": {"platform": ena_metadata["instrument_platform"]},
            "primer_scheme": ena_metadata["primer_scheme"],
        }
        if sample_method.name == "illumina":
            p["peReads"] = [
                {
                    "r1_uri": str(Path(sample) / (sample.name + ".reads_1.fastq.gz")),
                    "r1_md5": get_md5_file_hash(
                        str(Path(sample) / (sample.name + ".reads_1.fastq.gz"))
                    ),
                    "r2_uri": str(Path(sample) / (sample.name + ".reads_2.fastq.gz")),
                    "r2_md5": get_md5_file_hash(
                        str(Path(sample) / (sample.name + ".reads_2.fastq.gz"))
                    ),
                }
            ]
            p["seReads"] = []
        elif sample_method.name == "nanopore":
            p["seReads"] = [
                {
                    "uri": str(Path(sample) / (sample.name + ".reads.fastq.gz")),
                    "md5": get_md5_file_hash(
                        str(Path(sample) / (sample.name + ".reads.fastq.gz"))
                    ),
                }
            ]
            p["peReads"] = []
        else:
            logging.error(f"Invalid sample_method {sample_method}")
        samples.append(p)

        # add to dict for submitting to cached_dirs later
        path = sample.relative_to(sample_method)
        path = str(path.parent)
        if path in sample_shards:
            sample_shards[path].append(sample.name)
        else:
            sample_shards[path] = [sample.name]

    submission = {
        "batch": {
            "fileName": batch_name,
            "bucketName": sample_method.parent.parent.name,
            "organisation": "Public Repository Data",
            "site": "ENA Data",
            "uploadedOn": datetime.datetime.now().isoformat()[:-3] + "Z",
            # "uploadedBy": "*****@*****.**",
            "uploadedBy": config["ENA_user"],
            "samples": samples,
        }
    }

    apex_token = db.get_apex_token()
    apex_batch, apex_samples = db.post_metadata_to_apex(submission, apex_token)
    upload_bucket = db.get_output_bucket_from_input(
        sample_method.parent.parent.name, apex_token
    )

    for path, sample_list in sample_shards.items():
        add_to_cached_dirlist(sample_method.name, path, sample_list)

    # write the batch csv into batch_dir
    ena_batch_csv = Path(batch_dir) / f"{batch_name}.csv"
    out_fieldnames = ["bucket", "sample_prefix", "sample_accession"]
    with open(ena_batch_csv, "w") as out_csv:
        writer1 = csv.DictWriter(out_csv, fieldnames=out_fieldnames)
        writer1.writeheader()
        for sample, ena_metadata in samples_to_submit:
            out = {
                "bucket": submission["batch"]["bucketName"],
                "sample_prefix": str(
                    sample.relative_to(
                        Path("/data/inputs/s3/") / submission["batch"]["bucketName"]
                    )
                )
                + "/",
                "sample_accession": sample.name,
            }
            writer1.writerow(out)

    if str(workflow).lower() == "sars-cov2_workflows":
        flow = f"oxforduni-gpas-sars-cov2-{sample_method.name}"
    else:
        flow = f"oxforduni-ncov2019-artic-nf-{sample_method.name}"

    ret = catsgo.run_covid_ena(flow, str(ena_batch_csv), batch_name, upload_bucket)

    dirwatcher_metadata.update_one(
        {"catsup_uuid": batch_name},
        {
            "$set": {
                "run_uuid": ret.get("run_uuid", ""),
                "added_time": str(int(time.time())),
                "apex_batch": apex_batch,
                "apex_samples": apex_samples,
                "submitted_metadata": samples,
            }
        },
        upsert=True,
    )
    return []
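# Hypothetical usage sketch for process_batch, assuming the directory layout
# implied above: /data/inputs/s3/<bucket>/<site>/<illumina|nanopore>/<shard>/<sample>.
# All names below are made up for illustration.
sample_method = Path("/data/inputs/s3/ena-bucket/site1/illumina")
samples_to_submit = [
    (
        sample_method / "shard0" / "ERR0000001",
        {
            "control": "",
            "collection_date": "2021-01-01",
            "country": "United Kingdom",
            "region": "",
            "district": "",
            "specimen_organism": "SARS-CoV-2",
            "host": "Homo sapiens",
            "instrument_platform": "ILLUMINA",
            "primer_scheme": "auto",
        },
    ),
]
process_batch(sample_method, samples_to_submit, "/work/batches", "sars-cov2_workflows")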
def get_apex_token():
    return db.get_apex_token()