Example #1
import logging
import time

import db     # project-local helpers (APEX tokens)
import utils  # project-local helpers (config loading)

# get_finished_ok_sp3_runs, get_submitted_runlist, add_to_submitted_runlist and
# process_run are defined elsewhere in this module.


def watch(flow_name="oxforduni-gpas-sars-cov2-illumina"):
    apex_token = None
    apex_token_time = 0
    config = utils.load_oracle_config("config.json")

    while True:
        # get a new token every 5 hours (and on startup)
        time_now = int(time.time())
        apex_token_age = time_now - apex_token_time
        if apex_token_age > 5 * 60 * 60:
            logging.info(f"Acquiring new token (token age: {apex_token_age}s)")
            apex_token = db.get_apex_token()
            apex_token_time = time_now

        # new runs to submit are sp3 runs that have finished with status OK
        # minus runs that have been marked as already submitted
        finished_ok_sp3_runs = set(get_finished_ok_sp3_runs(flow_name))
        submitted_runs = set(get_submitted_runlist(flow_name))
        new_runs_to_submit = finished_ok_sp3_runs.difference(submitted_runs)

        for new_run_uuid in new_runs_to_submit:
            logging.info(f"new run: {new_run_uuid}")
            process_run(new_run_uuid, config, apex_token)
            add_to_submitted_runlist(flow_name, new_run_uuid)

        logging.info("sleeping for 60")
        time.sleep(60)
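
A minimal way to run this watcher as a script (an assumption for illustration; the snippet doesn't show the module's real entry point):

if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    watch()  # or watch(flow_name="...") for another flow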
Example #2
import logging

import requests

import db     # project-local helpers (APEX tokens)
import utils  # project-local helpers (config loading)

# get_sample_map_for_run is defined elsewhere in this module.


def submit_sample_data_error(
    error_str="Error: unspecified error",
    apex_database_sample_name=None,
    sp3_sample_name=None,
    sp3_run_uuid=None,
    apex_token=None,
    config=None,
):
    """Given a oracle sample id or sp3 sample name, submit sample error status."""

    if not apex_database_sample_name and not (sp3_sample_name and sp3_run_uuid):
        logging.error(
            "submit_sample_data_error: You need to provide either the oracle sample id or the sp3 sample name"
        )
        return
    if sp3_sample_name and sp3_run_uuid:
        m = get_sample_map_for_run(sp3_run_uuid)
        if not m:
            logging.error("submit_sample_data_error: couldn't get map for sample")
            return
        apex_database_sample_name = m.get(sp3_sample_name)
        if not apex_database_sample_name:
            logging.error(
                f"submit_sample_data_error: couldn't find {sp3_sample_name} in map"
            )
            return

    if not apex_token:
        apex_token = db.get_apex_token()
    if not config:
        config = utils.load_oracle_config("config.json")

    if not isinstance(error_str, str):
        # just in case a non-string (e.g. an exception) was passed in
        error_str = str(error_str)

    data = {
        "sample": {
            "operations": [
                {
                    "op": "replace",
                    "path": "errorMsg",
                    "value": error_str,
                },
                {"op": "replace", "path": "status", "value": "Error"},
            ]
        }
    }

    logging.info(f"PUTting error to {config['host']}/samples/{apex_database_sample_name}")
    sample_data_response = requests.put(
        f"{config['host']}/samples/{apex_database_sample_name}",
        headers={"Authorization": f"Bearer {apex_token}"},
        json=data,
    )
    return sample_data_response.text
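
For illustration, the two accepted ways to identify a sample (the sample names and uuid below are made up, not from the source):

# by Oracle/APEX database sample name:
submit_sample_data_error(
    error_str="Error: no consensus fasta",
    apex_database_sample_name="SAMPLE123",
)

# or by sp3 sample name plus run uuid (mapped via get_sample_map_for_run):
submit_sample_data_error(
    error_str="Error: no consensus fasta",
    sp3_sample_name="sample1",
    sp3_run_uuid="8b2f4a-example-uuid",
)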
Example #3
import logging
import sys
import time
from pathlib import Path

import db  # project-local helpers (APEX tokens)

# get_cached_dirlist, get_ignore_list and process_dir are defined elsewhere in
# this module.


def watch(
    watch_dir="/data/inputs/s3/oracle-test",
    bucket_name="catsup-test",
    max_submission_attempts=3,
    flow="ncov2019-artic-nf",
):
    """
    watch watch_dir for new directories that have the upload_done.txt file (signaling that an upload was successful)

    watch_dir example: /data/inputs/s3/oracle-test (for the catsup-test bucket. In the future we should probably name the directories the same as the bucket name!
    bucket_name: the bucket name that's mounted in the watch_dir directory (used by the pipeline to fetch the sample files)
    flow: currently a choice between ncov2019-artic-nf and sars-cov2_workflows
    """
    print(doc)
    watch_dir = Path(watch_dir)
    if not watch_dir.is_dir():
        logging.error(f"{watch_dir} is not a directory")
        sys.exit(1)

    while True:
        # get all directories in bucket
        # note that directories are named after submission uuids, so this is effectively a list of submission uuids
        candidate_dirs = {x.name for x in watch_dir.glob("*") if x.is_dir()}
        # get directories/submissions that have already been processed
        cached_dirlist = set(get_cached_dirlist(str(watch_dir)))
        # get directories/submissions that have failed
        bad_submission_uuids = set(get_ignore_list(str(watch_dir)))
        # submissions to be processed are those that are new and have not been marked as failed
        new_dirs = candidate_dirs.difference(cached_dirlist)
        new_dirs = new_dirs.difference(bad_submission_uuids)

        if new_dirs:
            apex_token = db.get_apex_token()
        for new_dir in new_dirs:  # new_dir is the catsup upload uuid
            r = process_dir(
                new_dir, watch_dir, bucket_name, apex_token, max_submission_attempts, flow
            )
            if r:
                # if we've started a run then stop processing and go to sleep. This prevents
                # the system from being overwhelmed with nextflow starting
                break

        print("sleeping for 60")
        time.sleep(60)
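
For illustration, the watcher could be started with its defaults or pointed at another mounted bucket (the non-default values below are made up):

watch()  # watch /data/inputs/s3/oracle-test for the catsup-test bucket

watch(
    watch_dir="/data/inputs/s3/my-bucket",
    bucket_name="my-bucket",
    flow="sars-cov2_workflows",
)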
Example #4
def remove_dir_from_lists(watch_dir, new_dir):
    # NB: the original snippet starts mid-function; this name and signature are
    # assumptions reconstructed from the variables used below.
    # mydb is assumed to be a pymongo database handle from module scope.
    dirlist = mydb["dirlist"]
    ignore_list = mydb["ignore_list"]

    print(f"removing {new_dir} from mongo dirlist and ignore_list of {watch_dir}")
    dirlist.update_one(
        {"watch_dir": watch_dir},
        {"$pull": {"dirs": new_dir}},
        upsert=True,
    )
    ignore_list.update_one(
        {"watch_dir": watch_dir},
        {"$pull": {"ignore_list": new_dir}},
        upsert=True,
    )


import requests

import db  # project-local helpers (APEX tokens)

apex_token = db.get_apex_token()
headers = {"Authorization": f"Bearer {apex_token}"}

# Get organisations to look up their input buckets
url = "https://portal.dev.gpas.ox.ac.uk/ords/gpasdevpdb1/grsp/sp3/organisations"
response = requests.get(url, headers=headers).json()

org_buckets = dict()
for org in response["items"]:
    org_buckets[org["organisationName"]] = org["inputBucketName"]

url = "https://portal.dev.gpas.ox.ac.uk/ords/gpasdevpdb1/grsp/sp3/batches_by_status/Uploaded"
response = requests.get(url, headers=headers).json()
found_batches = []
Example #5
import csv
import datetime
import logging
import time
import uuid
from pathlib import Path

import catsgo  # project-local sp3 client
import db      # project-local helpers (APEX tokens, metadata posting)

# get_md5_file_hash, add_to_cached_dirlist, dirwatcher_metadata and config are
# assumed to be defined elsewhere in this module.


def process_batch(sample_method, samples_to_submit, batch_dir, workflow):
    print(f"processing {samples_to_submit}")
    samples = list()
    sample_shards = dict()
    batch_name = "ENA-" + str(uuid.uuid4())[:7]
    submission_name = f"Entry for ENA sample processing - {batch_name}"

    for sample, ena_metadata in samples_to_submit:
        p = {
            "name": sample.name,
            "tags": ["ENA_Data"],
            "submissionTitle": submission_name,
            "submissionDescription": submission_name,
            "control": ena_metadata["control"],
            "collection_date": ena_metadata["collection_date"],
            "status": "Uploaded",
            "country": ena_metadata["country"],
            "region": ena_metadata["region"],
            "district": ena_metadata["district"],
            "specimen": ena_metadata["specimen_organism"],
            "host": ena_metadata["host"],
            "instrument": {"platform": ena_metadata["instrument_platform"],},
            "primer_scheme": ena_metadata["primer_scheme"],
        }
    
        if sample_method.name == "illumina":
            r1 = sample / (sample.name + ".reads_1.fastq.gz")
            r2 = sample / (sample.name + ".reads_2.fastq.gz")
            p["peReads"] = [
                {
                    "r1_uri": str(r1),
                    "r1_md5": get_md5_file_hash(str(r1)),
                    "r2_uri": str(r2),
                    "r2_md5": get_md5_file_hash(str(r2)),
                }
            ]
            p["seReads"] = []
        elif sample_method.name == "nanopore":
            se = sample / (sample.name + ".reads.fastq.gz")
            p["seReads"] = [
                {
                    "uri": str(se),
                    "md5": get_md5_file_hash(str(se)),
                }
            ]
            p["peReads"] = []
        else:
            logging.error(f"Invalid sample_method {sample_method}")
            continue  # don't submit a sample with no reads
        samples.append(p)
        # record the shard (sample's parent dir relative to sample_method) for cached_dirlist
        path = str(sample.relative_to(sample_method).parent)
        sample_shards.setdefault(path, []).append(sample.name)

    submission = {
        "batch": {
            "fileName": batch_name,
            "bucketName": sample_method.parent.parent.name,
            "organisation": "Public Repository Data",
            "site": "ENA Data",
            "uploadedOn": datetime.datetime.now().isoformat()[:-3] + "Z",
            # "uploadedBy": "*****@*****.**",
            "uploadedBy": config["ENA_user"],
            "samples": samples,
        }
    }

    apex_token = db.get_apex_token()
    apex_batch, apex_samples = db.post_metadata_to_apex(submission, apex_token)
    upload_bucket = db.get_output_bucket_from_input(
        sample_method.parent.parent.name, apex_token
    )

    for path, sample_list in sample_shards.items():
        add_to_cached_dirlist(sample_method.name, path, sample_list)

    # Add to batch_dir
    ena_batch_csv = Path(batch_dir) / f"{batch_name}.csv"
    out_fieldnames = ["bucket", "sample_prefix", "sample_accession"]
    with open(ena_batch_csv, "w", newline="") as out_csv:
        writer = csv.DictWriter(out_csv, fieldnames=out_fieldnames)
        writer.writeheader()
        for sample, ena_metadata in samples_to_submit:
            bucket = submission["batch"]["bucketName"]
            writer.writerow(
                {
                    "bucket": bucket,
                    "sample_prefix": str(sample.relative_to(Path("/data/inputs/s3/") / bucket)) + "/",
                    "sample_accession": sample.name,
                }
            )
    
    if str(workflow).lower() == "sars-cov2_workflows":
        flow = f"oxforduni-gpas-sars-cov2-{sample_method.name}"
    else:
        flow = f"oxforduni-ncov2019-artic-nf-{sample_method.name}"

    ret = catsgo.run_covid_ena(
        flow,
        str(ena_batch_csv),
        batch_name,
        upload_bucket,
    )

    dirwatcher_metadata.update_one(
        {"catsup_uuid": batch_name},
        {
            "$set": {
                "run_uuid": ret.get("run_uuid", ""),
                "added_time": str(int(time.time())),
                "apex_batch": apex_batch,
                "apex_samples": apex_samples,
                "submitted_metadata": samples,
            }
        },
        upsert=True,
    )

    return []
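
A sketch of the inputs process_batch expects, inferred from how they are used above (all paths, accessions and metadata values are illustrative, not from the source):

sample_method = Path("/data/inputs/s3/catsup-test/ENA/illumina")  # its .name picks the read layout
samples_to_submit = [
    (
        sample_method / "shard0" / "ERR0000001",  # sample dir; its .name is the accession
        {
            "control": "",
            "collection_date": "2021-01-01",
            "country": "United Kingdom",
            "region": "",
            "district": "",
            "specimen_organism": "SARS-CoV-2",
            "host": "Homo sapiens",
            "instrument_platform": "Illumina",
            "primer_scheme": "auto",
        },
    ),
]
process_batch(sample_method, samples_to_submit, batch_dir="/tmp/batches", workflow="sars-cov2_workflows")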
Example #6
def get_apex_token():
    # thin convenience wrapper around the project-local db helper
    return db.get_apex_token()