def qa_stuff(metadata, sequence_p1, sequence_p2):
    """Run metadata QC and sequence QC, exiting the process on any failure.

    Args:
        metadata: object whose ``name`` attribute is handed to
            ``qc_metadata`` (presumably an open file; confirm with caller).
        sequence_p1: first sequence input, passed to ``qc_fasta``.
        sequence_p2: optional second sequence input. When truthy, both
            inputs are treated as a pair and renamed ``reads_1.*`` and
            ``reads_2.*``.

    Returns:
        A list with one ``(filename, label)`` tuple per sequence input, as
        produced by ``qc_fasta`` (with the filename rewritten for pairs).

    Exits with status 1 when the metadata check returns a falsy value or
    when either check raises ``ValueError``.
    """
    try:
        log.debug("Checking metadata")
        if not qc_metadata(metadata.name):
            log.warning("Failed metadata qc")
            exit(1)
    except ValueError as e:
        log.debug(e)
        log.debug("Failed metadata qc")
        print(e)
        exit(1)

    target = []
    try:
        log.debug("Checking FASTA/FASTQ QC")
        target.append(qc_fasta(sequence_p1))
        if sequence_p2:
            target.append(qc_fasta(sequence_p2))
            # Drop the first six characters of each name (presumably the
            # "reads." prefix) and substitute the numbered paired prefix.
            target[0] = ("reads_1." + target[0][0][6:], target[0][1])
            # Each read keeps its OWN label; the previous code copied the
            # label of the first read into the second entry.
            target[1] = ("reads_2." + target[1][0][6:], target[1][1])
    except ValueError as e:
        log.debug(e)
        log.debug("Failed FASTA qc")
        print(e)
        exit(1)

    return target
def qc_stuff(metadata, sequence_p1, sequence_p2, do_qc=True):
    """Quality-control the metadata and the sequence input(s).

    The metadata check (``qc_metadata``) only runs when ``do_qc`` is true.
    The sequence check (``qc_fasta``) runs whenever ``sequence_p1`` is
    truthy. A failure in the metadata stage does not skip the sequence
    stage; both are attempted and the process exits with status 1
    afterwards if either one failed.

    Returns the list of tuples produced by ``qc_fasta``; for paired input
    the file names are rewritten to ``reads_1.*`` / ``reads_2.*``.
    """
    ok = True
    sample_id = ''

    # --- metadata stage ---------------------------------------------------
    try:
        if do_qc:
            log.debug("Checking metadata")
            sample_id = qc_metadata(metadata.name)
            if not sample_id:
                log.warning("Failed metadata QC")
                ok = False
        else:
            log.debug("Skipping metadata check")
    except Exception:
        log.exception("Failed metadata QC")
        # Remember the failure but still run the FASTA checker below.
        ok = False

    # --- sequence stage ---------------------------------------------------
    target = []
    if sequence_p1:
        try:
            if do_qc:
                log.debug("FASTA/FASTQ QC")
            else:
                log.debug("Limited FASTA/FASTQ QC")

            first = qc_fasta(sequence_p1, check_with_mimimap2=do_qc)
            target.append(first)

            if sequence_p2:
                # A second file only makes sense for paired reads.
                if first[2] == 'text/fasta':
                    raise ValueError(
                        "It is possible to upload just one FASTA file at a time"
                    )
                second = qc_fasta(sequence_p2)
                target.append(second)
                target[0] = ("reads_1." + first[0][6:], first[1], first[2])
                target[1] = ("reads_2." + second[0][6:], second[1], second[2])

            # With full QC, a FASTA header must match the metadata sample id.
            if do_qc:
                if target[0][2] == 'text/fasta' and sample_id != target[0][1]:
                    raise ValueError(
                        f"The sample_id field in the metadata ({sample_id}) must be the same as the FASTA header ({target[0][1]})"
                    )
        except Exception:
            log.exception("Failed sequence QC")
            ok = False

    if not ok:
        log.debug("Bailing out!")
        exit(1)

    return target
def main():
    """Command-line entry point: QC and upload one sequence plus metadata.

    Parses two positional arguments (``sequence`` and ``metadata``), runs
    ``qc_fasta`` on the sequence file and ``qc_metadata`` on the metadata
    path, then copies both files in 64 KiB chunks into a new Arvados
    collection and saves it under ``UPLOAD_PROJECT`` together with
    properties describing the uploader.

    Exits with status 1 when the metadata check returns a falsy value.
    """
    parser = argparse.ArgumentParser(
        description='Upload SARS-CoV-19 sequences for analysis')
    parser.add_argument('sequence', type=argparse.FileType('r'),
                        help='sequence FASTA/FASTQ')
    parser.add_argument('metadata', type=argparse.FileType('r'),
                        help='sequence metadata json')
    args = parser.parse_args()

    # NOTE(review): insecure=True presumably turns off TLS certificate
    # verification for the API client - confirm this is intended.
    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN,
                      insecure=True)

    # The value returned by qc_fasta is used below as the file name inside
    # the collection.
    target = qc_fasta(args.sequence)

    if not qc_metadata(args.metadata.name):
        print("Failed metadata qc")
        exit(1)

    col = arvados.collection.Collection(api_client=api)

    with col.open(target, "w") as f:
        r = args.sequence.read(65536)
        print(r[0:20])
        while r:
            f.write(r)
            r = args.sequence.read(65536)
    args.sequence.close()

    print("Reading metadata")
    with col.open("metadata.yaml", "w") as f:
        r = args.metadata.read(65536)
        print(r[0:20])
        while r:
            f.write(r)
            r = args.metadata.read(65536)
    args.metadata.close()

    # Ask an external service which public IP this upload comes from; the
    # response is closed as soon as it has been read.
    with urllib.request.urlopen('https://ident.me') as response:
        external_ip = response.read().decode('utf8')

    properties = {
        "upload_app": "bh20-seq-uploader",
        "upload_ip": external_ip,
        # The previous template contained no conversion specifiers, so the
        # % operator raised TypeError; format as user@host.
        "upload_user": "%s@%s" % (getpass.getuser(), socket.gethostname()),
    }

    col.save_new(owner_uuid=UPLOAD_PROJECT,
                 name="Uploaded by %s from %s" % (properties['upload_user'],
                                                  properties['upload_ip']),
                 properties=properties,
                 ensure_unique_name=True)

    print("Done")
def validate_upload(api, collection, validated_project, fastq_project,
                    fastq_workflow_uuid):
    """Validate an uploaded collection and move it to the validated project.

    Args:
        api: Arvados API client used for the duplicate lookup and update.
        collection: collection record (dict); the keys ``uuid``, ``name``
            and ``portable_data_hash`` are read here.
        validated_project: owner uuid of the project holding validated
            uploads.
        fastq_project: passed through to ``start_fastq_to_fasta``.
        fastq_workflow_uuid: passed through to ``start_fastq_to_fasta``.

    Returns:
        True when the collection passed every check and was moved to
        ``validated_project``. False otherwise, including the case where
        the upload holds FASTQ reads and a FASTQ-to-FASTA run was started
        instead.
    """
    col = arvados.collection.Collection(collection["uuid"])

    # validate the collection here.  Check metadata, etc.
    valid = True

    if "metadata.yaml" not in col:
        logging.warning("Upload '%s' missing metadata.yaml",
                        collection["name"])
        valid = False
    else:
        try:
            # Close the metadata handle as soon as it has been parsed.
            with col.open("metadata.yaml") as md:
                metadata_content = ruamel.yaml.round_trip_load(md)
            metadata_content["id"] = (
                "http://arvados.org/keep:%s/metadata.yaml"
                % collection["portable_data_hash"])
            sample_id = metadata_content["sample"]["sample_id"]
            add_lc_filename(metadata_content, metadata_content["id"])
            valid = qc_metadata(metadata_content) and valid
        except Exception as e:
            logging.warning(e)
            valid = False
        if not valid:
            logging.warning("Failed metadata qc")

    if valid:
        try:
            tgt = None
            # A first-of-pair file may legitimately be detected under the
            # unpaired name.
            paired = {
                "reads_1.fastq": "reads.fastq",
                "reads_1.fastq.gz": "reads.fastq.gz",
            }
            fastq_names = ("reads.fastq", "reads.fastq.gz",
                           "reads_1.fastq", "reads_1.fastq.gz")
            for n in ("sequence.fasta",) + fastq_names:
                if n not in col:
                    continue
                with col.open(n, 'rb') as qf:
                    tgt = qc_fasta(qf)[0]
                    if tgt != n and tgt != paired.get(n):
                        logging.info(
                            "Expected %s but magic says it should be %s",
                            n, tgt)
                        valid = False
                    elif tgt in fastq_names:
                        # Reads are converted first; this upload is not
                        # validated in this pass.
                        start_fastq_to_fasta(api, collection, fastq_project,
                                             fastq_workflow_uuid, n,
                                             sample_id)
                        return False
            if tgt is None:
                valid = False
                logging.warning(
                    "Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq",
                    collection["name"])
        except ValueError as v:
            # Record why sequence QC rejected the upload instead of
            # dropping the reason silently.
            logging.warning("Upload '%s' failed sequence qc: %s",
                            collection["name"], v)
            valid = False

    dup = api.collections().list(
        filters=[["owner_uuid", "=", validated_project],
                 ["portable_data_hash", "=",
                  col.portable_data_hash()]]).execute()
    if dup["items"]:
        # This exact collection has been uploaded before.
        valid = False
        logging.warning("Upload '%s' is duplicate", collection["name"])

    if valid:
        logging.info("Added '%s' to validated sequences", collection["name"])
        # Move it to the "validated" project to be included in the next
        # analysis
        api.collections().update(
            uuid=collection["uuid"],
            body={
                "owner_uuid": validated_project,
                "name": "%s (%s)" % (collection["name"],
                                     time.asctime(time.gmtime())),
            }).execute()
    else:
        # It is invalid; deletion is only suggested, not performed.
        logging.warning("Suggest deleting '%s'", collection["name"])

    return valid
def main():
    """Command-line entry point: QC, optionally dry-run, then upload.

    Parses ``sequence`` and ``metadata`` positional arguments plus the
    ``--validate`` flag. Runs ``qc_fasta`` on the sequence file and
    ``qc_metadata`` on the metadata path. With ``--validate`` it prints
    "Valid" and exits with status 0 without uploading. Otherwise both files
    are copied in 64 KiB chunks into a new Arvados collection, which is
    saved under ``UPLOAD_PROJECT`` with properties describing the upload.

    Exits with status 1 when ``qc_fasta`` raises ``ValueError`` or the
    metadata check returns a falsy value.
    """
    parser = argparse.ArgumentParser(
        description='Upload SARS-CoV-19 sequences for analysis')
    parser.add_argument('sequence', type=argparse.FileType('r'),
                        help='sequence FASTA/FASTQ')
    parser.add_argument('metadata', type=argparse.FileType('r'),
                        help='sequence metadata json')
    parser.add_argument("--validate", action="store_true",
                        help="Dry run, validate only")
    args = parser.parse_args()

    # NOTE(review): insecure=True presumably turns off TLS certificate
    # verification for the API client - confirm this is intended.
    api = arvados.api(host=ARVADOS_API_HOST, token=ARVADOS_API_TOKEN,
                      insecure=True)

    try:
        target = qc_fasta(args.sequence)
    except ValueError as e:
        print(e)
        exit(1)

    if not qc_metadata(args.metadata.name):
        print("Failed metadata qc")
        exit(1)

    if args.validate:
        print("Valid")
        exit(0)

    col = arvados.collection.Collection(api_client=api)

    with col.open(target, "w") as f:
        r = args.sequence.read(65536)
        # The label is the first line of the file minus its first character
        # (presumably the ">" / "@" record marker).
        seqlabel = r[1:r.index("\n")]
        print(seqlabel)
        while r:
            f.write(r)
            r = args.sequence.read(65536)
    args.sequence.close()

    print("Reading metadata")
    with col.open("metadata.yaml", "w") as f:
        r = args.metadata.read(65536)
        print(r[0:20])
        while r:
            f.write(r)
            r = args.metadata.read(65536)
    args.metadata.close()

    # Ask an external service which public IP this upload comes from; the
    # response is closed as soon as it has been read.
    with urllib.request.urlopen('https://ident.me') as response:
        external_ip = response.read().decode('utf8')

    try:
        username = getpass.getuser()
    except (KeyError, OSError):
        # No resolvable login name. Newer Python versions raise OSError
        # here rather than KeyError, so both are handled.
        username = "******"

    properties = {
        "sequence_label": seqlabel,
        "upload_app": "bh20-seq-uploader",
        "upload_ip": external_ip,
        # The previous template contained no conversion specifiers, so the
        # % operator raised TypeError; format as user@host.
        "upload_user": "%s@%s" % (username, socket.gethostname()),
    }

    col.save_new(
        owner_uuid=UPLOAD_PROJECT,
        name="%s uploaded by %s from %s" % (seqlabel,
                                            properties['upload_user'],
                                            properties['upload_ip']),
        properties=properties,
        ensure_unique_name=True)

    print("Done")
def validate_upload(self, collection, revalidate):
    """Validate one uploaded collection and file it as validated/rejected.

    Args:
        collection: collection record (dict); the keys ``uuid``, ``name``,
            ``owner_uuid`` and ``properties`` are read here, and
            ``properties`` is updated in place.
        revalidate: when false, collections whose ``status`` property is
            already "validated" or "rejected" are skipped.

    Returns:
        True when the upload was accepted, either moved into
        ``self.validated_project`` or merged into an existing validated
        collection with the same sequence label. False when it was
        skipped, rejected, or handed to ``self.start_fastq_to_fasta``.
    """
    if not revalidate and collection["properties"].get("status") in (
            "validated", "rejected"):
        return False

    with arvados.collection.CollectionReader(
            collection["uuid"],
            api_client=self.api,
            keep_client=self.keepclient) as col:
        # validate the collection here.  Check metadata, etc.
        logging.info("Validating upload '%s' (%s)",
                     collection["name"], collection["uuid"])

        errors = []
        # Bind these up front: the metadata stage can be skipped or fail
        # before assigning them, and the code below must then report a
        # rejection rather than die with UnboundLocalError.
        sample_id = None
        existing = {"items": []}

        if collection["owner_uuid"] != self.validated_project:
            dup = self.api.collections().list(filters=[
                ["owner_uuid", "=", self.validated_project],
                ["portable_data_hash", "=", col.portable_data_hash()],
            ]).execute()
            if dup["items"]:
                # This exact collection has been uploaded before.
                errors.append("Duplicate of %s"
                              % ([d["uuid"] for d in dup["items"]]))

        if not errors:
            if "metadata.yaml" not in col:
                errors.append("%s missing metadata.yaml"
                              % collection["name"])
            else:
                try:
                    with col.open("metadata.yaml") as md:
                        metadata_content = ruamel.yaml.round_trip_load(md)
                    metadata_content["id"] = (
                        "http://covid19.genenetwork.org/resource/%s"
                        % collection["uuid"])
                    sample_id = metadata_content["sample"]["sample_id"]
                    add_lc_filename(metadata_content, metadata_content["id"])
                    valid = qc_metadata(metadata_content)
                    if not valid:
                        errors.append("Failed metadata qc")
                except Exception as e:
                    errors.append(str(e))

        if not errors:
            # sample_id is guaranteed to be set here. Look for an already
            # validated collection carrying the same sequence label.
            existing = self.api.collections().list(filters=[
                ["owner_uuid", "=", self.validated_project],
                ["properties.sequence_label", "=", sample_id],
            ]).execute()

            try:
                tgt = None
                # A first-of-pair file may legitimately be detected under
                # the unpaired name.
                paired = {
                    "reads_1.fastq": "reads.fastq",
                    "reads_1.fastq.gz": "reads.fastq.gz",
                }
                fastq_names = ("reads.fastq", "reads.fastq.gz",
                               "reads_1.fastq", "reads_1.fastq.gz")
                for n in ("sequence.fasta",) + fastq_names:
                    if n not in col:
                        continue
                    with col.open(n, 'rb') as qf:
                        tgt, seqlabel, seq_type = qc_fasta(qf)
                        if tgt != n and tgt != paired.get(n):
                            errors.append(
                                "Expected %s but magic says it should be %s"
                                % (n, tgt))
                        elif tgt in fastq_names:
                            # Reads are converted first; this upload is
                            # not validated in this pass.
                            self.start_fastq_to_fasta(collection, n,
                                                      sample_id)
                            return False

                        # If it is a FASTA
                        if sample_id != seqlabel:
                            errors.append(
                                "Expected sample_id == seqlabel, but %s != %s"
                                % (sample_id, seqlabel))
                # A metadata-only upload is acceptable when it updates an
                # existing validated collection.
                if tgt is None and len(existing["items"]) == 0:
                    errors.append(
                        "Upload '%s' does not contain sequence.fasta, reads.fastq or reads_1.fastq"
                        % collection["name"])
            except Exception as v:
                errors.append(str(v))

        if errors:
            # It is invalid
            logging.warning("'%s' (%s) has validation errors: %s",
                            collection["name"], collection["uuid"],
                            "\n".join(errors))
            collection["properties"]["status"] = "rejected"
            collection["properties"]["errors"] = errors
            self.api.collections().update(
                uuid=collection["uuid"],
                body={"properties": collection["properties"]}).execute()
            return False

        update_from = None
        if existing["items"]:
            # "collection" is the newly uploaded one we're looking at
            update_from = collection
            collection = existing["items"][0]
            collection["properties"] = update_from["properties"]

        if "errors" in collection["properties"]:
            del collection["properties"]["errors"]
        collection["properties"]["status"] = "validated"
        collection["properties"]["sequence_label"] = sample_id

        if update_from:
            # Refresh the metadata of the existing validated collection,
            # then drop the new upload that carried it.
            with arvados.collection.Collection(
                    collection["uuid"],
                    api_client=self.api,
                    keep_client=self.keepclient) as update_existing_col:
                update_existing_col.copy("metadata.yaml",
                                         "metadata.yaml",
                                         source_collection=col,
                                         overwrite=True)
                update_existing_col.save(
                    properties=collection["properties"])
            self.api.collections().delete(
                uuid=update_from["uuid"]).execute()
            logging.info("Updated '%s' in validated sequences",
                         collection["name"])
        else:
            # Move it to the "validated" project to be included in the
            # next analysis
            self.api.collections().update(
                uuid=collection["uuid"],
                body={
                    "owner_uuid": self.validated_project,
                    "name": "%s (%s)" % (collection["name"],
                                         time.asctime(time.gmtime())),
                    "properties": collection["properties"],
                }).execute()
            logging.info("Added '%s' to validated sequences",
                         collection["name"])

        return True