Пример #1
0
def main():
    """Validate a sequence file and its metadata, then upload both to Arvados.

    Command-line arguments:
        sequence: path to the sequence file to upload.
        metadata: path to the metadata file describing the sequence.
        --validate: dry run; perform the QC checks and exit without uploading.

    The process exits with status 1 when sequence or metadata QC fails, and
    with status 0 after a successful ``--validate`` run.  Otherwise both files
    are written into a new collection (the metadata under the name
    ``metadata.yaml``) which is saved into ``UPLOAD_PROJECT`` together with
    properties describing the upload.
    """
    # Local import keeps this block self-contained; sys.exit is preferred over
    # the interactive ``exit`` helper, which is only installed by ``site``.
    import sys

    parser = argparse.ArgumentParser(
        description='Upload SARS-CoV-19 sequences for analysis')
    parser.add_argument('sequence',
                        type=argparse.FileType('r'),
                        help='sequence FASTA/FASTQ')
    parser.add_argument('metadata',
                        type=argparse.FileType('r'),
                        help='sequence metadata json')
    parser.add_argument("--validate",
                        action="store_true",
                        help="Dry run, validate only")
    args = parser.parse_args()

    # NOTE(review): insecure=True presumably disables TLS certificate
    # verification for the API connection -- confirm this is intended.
    api = arvados.api(host=ARVADOS_API_HOST,
                      token=ARVADOS_API_TOKEN,
                      insecure=True)

    # qc_fasta is handed the open file object; its return value is used as
    # the file name inside the collection.
    # NOTE(review): the upload below reads the same handle again, so this
    # assumes qc_fasta leaves it positioned at the start -- confirm.
    try:
        target = qc_fasta(args.sequence)
    except ValueError as e:
        print(e)
        sys.exit(1)

    if not qc_metadata(args.metadata.name):
        print("Failed metadata qc")
        sys.exit(1)

    if args.validate:
        print("Valid")
        sys.exit(0)

    col = arvados.collection.Collection(api_client=api)

    with col.open(target, "w") as f:
        r = args.sequence.read(65536)
        # The label is the first line without its leading marker character.
        # partition() tolerates a first chunk that contains no newline,
        # where str.index() raised ValueError.
        seqlabel = r.partition("\n")[0][1:]
        print(seqlabel)
        while r:
            f.write(r)
            r = args.sequence.read(65536)
    args.sequence.close()

    print("Reading metadata")
    with col.open("metadata.yaml", "w") as f:
        r = args.metadata.read(65536)
        print(r[0:20])
        while r:
            f.write(r)
            r = args.metadata.read(65536)
    args.metadata.close()

    # Record the uploader's public IP as reported by ident.me; the response
    # is closed as soon as it has been read.
    with urllib.request.urlopen('https://ident.me') as response:
        external_ip = response.read().decode('utf8')

    try:
        username = getpass.getuser()
    except KeyError:
        username = "******"

    properties = {
        "sequence_label": seqlabel,
        "upload_app": "bh20-seq-uploader",
        "upload_ip": external_ip,
        # Two values are interpolated, so the format needs two specifiers;
        # a literal without any raised TypeError at runtime.
        "upload_user": "%s@%s" % (username, socket.gethostname())
    }

    col.save_new(
        owner_uuid=UPLOAD_PROJECT,
        name="%s uploaded by %s from %s" %
        (seqlabel, properties['upload_user'], properties['upload_ip']),
        properties=properties,
        ensure_unique_name=True)

    print("Done")
Пример #2
0
def validate_upload(api, collection, validated_project, fastq_project,
                    fastq_workflow_uuid):
    """Validate an uploaded collection and move it to the validated project.

    Args:
        api: Arvados API client used for the duplicate lookup and the update.
        collection: collection record; its "uuid", "name" and
            "portable_data_hash" entries are read.
        validated_project: owner uuid that validated uploads are moved to.
        fastq_project: passed through to ``start_fastq_to_fasta``.
        fastq_workflow_uuid: passed through to ``start_fastq_to_fasta``.

    Returns:
        A truthy value when the upload passed every check and was moved to
        ``validated_project``; a falsy value when it was found invalid.
        ``False`` is also returned, without moving or flagging the upload,
        when it contains FASTQ reads and a conversion was started instead.
    """
    col = arvados.collection.Collection(collection["uuid"])

    # validate the collection here.  Check metadata, etc.
    valid = True

    if "metadata.yaml" not in col:
        logging.warning("Upload '%s' missing metadata.yaml",
                        collection["name"])
        valid = False
    else:
        try:
            # Close the reader as soon as the YAML has been parsed.
            with col.open("metadata.yaml") as md:
                metadata_content = ruamel.yaml.round_trip_load(md)
            metadata_content[
                "id"] = "http://arvados.org/keep:%s/metadata.yaml" % collection[
                    "portable_data_hash"]
            sample_id = metadata_content["sample"]["sample_id"]
            add_lc_filename(metadata_content, metadata_content["id"])
            valid = qc_metadata(metadata_content) and valid
        except Exception as e:
            # Any failure while loading or checking the metadata marks the
            # upload invalid rather than aborting the caller.
            logging.warning(e)
            valid = False
        if not valid:
            logging.warning("Failed metadata qc")

    if valid:
        try:
            tgt = None
            # A first paired-end file is accepted when the detected type is
            # the corresponding single-file name.
            paired = {
                "reads_1.fastq": "reads.fastq",
                "reads_1.fastq.gz": "reads.fastq.gz"
            }
            for n in ("sequence.fasta", "reads.fastq", "reads.fastq.gz",
                      "reads_1.fastq", "reads_1.fastq.gz"):
                if n not in col:
                    continue
                with col.open(n, 'rb') as qf:
                    tgt = qc_fasta(qf)[0]
                    if tgt != n and tgt != paired.get(n):
                        logging.info(
                            "Expected %s but magic says it should be %s", n,
                            tgt)
                        valid = False
                    elif tgt in ("reads.fastq", "reads.fastq.gz",
                                 "reads_1.fastq", "reads_1.fastq.gz"):
                        # Reads are handed to the conversion workflow; this
                        # upload is neither accepted nor rejected here.
                        start_fastq_to_fasta(api, collection, fastq_project,
                                             fastq_workflow_uuid, n, sample_id)
                        return False
            if tgt is None:
                valid = False
                logging.warning(
                    "Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq",
                    collection["name"])
        except ValueError as v:
            # Record why sequence QC failed instead of dropping the reason.
            logging.warning("Sequence QC failed for upload '%s': %s",
                            collection["name"], v)
            valid = False

    dup = api.collections().list(
        filters=[["owner_uuid", "=", validated_project],
                 ["portable_data_hash", "=",
                  col.portable_data_hash()]]).execute()
    if dup["items"]:
        # This exact collection has been uploaded before.
        valid = False
        logging.warning("Upload '%s' is duplicate", collection["name"])

    if valid:
        logging.info("Added '%s' to validated sequences", collection["name"])
        # Move it to the "validated" project to be included in the next analysis
        api.collections().update(
            uuid=collection["uuid"],
            body={
                "owner_uuid":
                validated_project,
                "name":
                "%s (%s)" % (collection["name"], time.asctime(time.gmtime()))
            }).execute()
    else:
        # It is invalid; deletion is only suggested, nothing is removed here.
        logging.warning("Suggest deleting '%s'", collection["name"])

    return valid
Пример #3
0
    def validate_upload(self, collection, revalidate):
        """Validate one uploaded collection and accept or reject it.

        Args:
            collection: collection record; its "uuid", "name", "owner_uuid"
                and "properties" entries are read, and "properties" is
                modified in place.
            revalidate: when false, uploads whose "status" property is already
                "validated" or "rejected" are skipped.

        Returns:
            True when the upload was accepted: either moved into
            ``self.validated_project``, or merged into an existing validated
            collection with the same sequence label (the new upload is then
            deleted).  False when the upload was skipped, rejected (its
            properties are updated with status "rejected" and the error
            list), or handed to the FASTQ conversion workflow.
        """
        if not revalidate and collection["properties"].get("status") in (
                "validated", "rejected"):
            return False

        with arvados.collection.CollectionReader(
                collection["uuid"],
                api_client=self.api,
                keep_client=self.keepclient) as col:
            # validate the collection here.  Check metadata, etc.
            logging.info("Validating upload '%s' (%s)", collection["name"],
                         collection["uuid"])

            errors = []
            sample_id = None

            if collection["owner_uuid"] != self.validated_project:
                dup = self.api.collections().list(filters=[[
                    "owner_uuid", "=", self.validated_project
                ], ["portable_data_hash", "=",
                    col.portable_data_hash()]]).execute()
                if dup["items"]:
                    # This exact collection has been uploaded before.
                    errors.append("Duplicate of %s" %
                                  ([d["uuid"] for d in dup["items"]]))

            if not errors:
                if "metadata.yaml" not in col:
                    errors.append("%s missing metadata.yaml" %
                                  collection["name"])
                else:
                    try:
                        with col.open("metadata.yaml") as md:
                            metadata_content = ruamel.yaml.round_trip_load(md)
                        metadata_content[
                            "id"] = "http://covid19.genenetwork.org/resource/%s" % collection[
                                "uuid"]
                        sample_id = metadata_content["sample"]["sample_id"]
                        add_lc_filename(metadata_content,
                                        metadata_content["id"])
                        if not qc_metadata(metadata_content):
                            errors.append("Failed metadata qc")
                    except Exception as e:
                        errors.append(str(e))

            # Look for an already validated upload of the same sample only
            # once the metadata was parsed without errors.  Previously the
            # lookup ran unconditionally and raised UnboundLocalError on
            # sample_id for duplicates or unreadable metadata, so those
            # uploads were never marked as rejected.
            existing = {"items": []}
            if not errors:
                existing = self.api.collections().list(filters=[[
                    "owner_uuid", "=", self.validated_project
                ], ["properties.sequence_label", "=", sample_id]]).execute()

            if not errors:
                try:
                    tgt = None
                    # A first paired-end file is accepted when the detected
                    # type is the corresponding single-file name.
                    paired = {
                        "reads_1.fastq": "reads.fastq",
                        "reads_1.fastq.gz": "reads.fastq.gz"
                    }
                    for n in ("sequence.fasta", "reads.fastq",
                              "reads.fastq.gz", "reads_1.fastq",
                              "reads_1.fastq.gz"):
                        if n not in col:
                            continue
                        with col.open(n, 'rb') as qf:
                            tgt, seqlabel, _ = qc_fasta(qf)
                            if tgt != n and tgt != paired.get(n):
                                errors.append(
                                    "Expected %s but magic says it should be %s"
                                    % (n, tgt))
                            elif tgt in ("reads.fastq", "reads.fastq.gz",
                                         "reads_1.fastq", "reads_1.fastq.gz"):
                                # Reads are handed to the conversion workflow;
                                # this upload is neither accepted nor rejected
                                # here.
                                self.start_fastq_to_fasta(
                                    collection, n, sample_id)
                                return False

                            # If it is a FASTA
                            if sample_id != seqlabel:
                                errors.append(
                                    "Expected sample_id == seqlabel, but %s != %s"
                                    % (sample_id, seqlabel))
                    # A metadata-only upload is acceptable when it updates an
                    # already validated sample.
                    if tgt is None and len(existing["items"]) == 0:
                        errors.append(
                            "Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq"
                            % collection["name"])
                except Exception as v:
                    errors.append(str(v))

            if errors:
                # It is invalid
                logging.warning("'%s' (%s) has validation errors: %s",
                                collection["name"], collection["uuid"],
                                "\n".join(errors))
                collection["properties"]["status"] = "rejected"
                collection["properties"]["errors"] = errors
                self.api.collections().update(uuid=collection["uuid"],
                                              body={
                                                  "properties":
                                                  collection["properties"]
                                              }).execute()
                return False

            update_from = None
            if existing["items"]:
                # "collection" is the newly uploaded one we're looking at
                update_from = collection
                collection = existing["items"][0]
                collection["properties"] = update_from["properties"]

            if "errors" in collection["properties"]:
                del collection["properties"]["errors"]
            collection["properties"]["status"] = "validated"
            collection["properties"]["sequence_label"] = sample_id

            if update_from:
                # Replace the metadata of the existing validated collection
                # with the new upload's, then drop the new upload.
                with arvados.collection.Collection(
                        collection["uuid"],
                        api_client=self.api,
                        keep_client=self.keepclient) as update_existing_col:
                    update_existing_col.copy("metadata.yaml",
                                             "metadata.yaml",
                                             source_collection=col,
                                             overwrite=True)
                    update_existing_col.save(
                        properties=collection["properties"])
                self.api.collections().delete(
                    uuid=update_from["uuid"]).execute()
                logging.info("Updated '%s' in validated sequences",
                             collection["name"])
            else:
                # Move it to the "validated" project to be included in the next analysis
                self.api.collections().update(
                    uuid=collection["uuid"],
                    body={
                        "owner_uuid":
                        self.validated_project,
                        "name":
                        "%s (%s)" %
                        (collection["name"], time.asctime(time.gmtime())),
                        "properties":
                        collection["properties"]
                    }).execute()
                logging.info("Added '%s' to validated sequences",
                             collection["name"])

            return True