def main():
    """
    Script entry point: PATCHes a single property/value pair onto each of the
    given records on the ENCODE Portal, reporting how many succeeded.

    Record IDs come from the command line, or from the input file (one ID per
    line, blank lines and '#' comments ignored) when none are given directly.
    """
    parser = get_parser()
    args = parser.parse_args()
    rec_ids = args.records
    infile = args.infile
    prop_name = args.property
    prop_val = args.value
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    if not rec_ids:
        # Then get them from input file.
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                rec_ids.append(line)
    patch_cnt = 0
    for i in rec_ids:
        try:
            conn.patch({prop_name: prop_val, conn.ENCID_KEY: i})
            patch_cnt += 1
        except Exception as e:
            # Guard the attribute chain: not every exception carries a
            # requests-style .response object.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status == 422:  # Unprocessable Entity
                # Then likely this property is not defined for this record.
                text = json.loads(e.response.text)
                print("Can't PATCH record {}: {}".format(i, text["errors"]))
            else:
                # Bug fix: previously any non-422 failure was silently
                # swallowed; now unexpected errors propagate.
                raise
    print("Finished: PATCHED {} records.".format(
        str(patch_cnt) + "/" + str(len(rec_ids))))
def main():
    """
    Collects the unique genetic modification identifiers referenced by the
    biosamples of the given experiment records, printing each newly seen one
    and writing them all to the output file, one per line.

    Record IDs come from the command line, or from the input file (one ID per
    line, blank lines and '#' comments ignored) when none are given directly.
    """
    parser = get_parser()
    args = parser.parse_args()
    rec_ids = args.records
    infile = args.infile
    outfile = args.outfile
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    if not rec_ids:
        # Then get them from input file.
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                if not line or line.startswith("#"):
                    continue
                rec_ids.append(line)
    gms = []
    for i in rec_ids:
        rec = conn.get(i)
        for rep in rec["replicates"]:
            for gm in rep["library"]["biosample"]["genetic_modifications"]:
                # The GM is given as an @id path; keep only the trailing
                # identifier segment.
                gm = gm.strip("/").split("/")[-1]
                if gm not in gms:
                    print(gm)
                    gms.append(gm)
    with open(outfile, "w") as fout:
        for i in gms:
            # Bug fix: entries were previously written without a separator,
            # producing one long concatenated line.
            fout.write(i + "\n")
def main():
    """
    Script entry point: POSTs (default) or PATCHes records of the given
    profile to the Portal, one payload per row of the input file.

    Raises:
        ValueError: when patching and a payload lacks the RECORD_ID_FIELD
            needed to identify the record to PATCH.
    """
    parser = get_parser()
    args = parser.parse_args()
    profile_id = args.profile_id
    dcc_mode = args.dcc_mode
    dry_run = args.dry_run
    no_aliases = args.no_aliases
    overwrite_array_values = args.overwrite_array_values
    if dcc_mode:
        conn = euc.Connection(dcc_mode, dry_run)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    # Put conn into submit mode:
    conn.set_submission(True)
    infile = args.infile
    patch = args.patch
    gen = create_payloads(profile_id=profile_id, infile=infile)
    for payload in gen:
        if not patch:
            conn.post(payload, require_aliases=not no_aliases)
        else:
            record_id = payload.get(RECORD_ID_FIELD, False)
            if not record_id:
                # ValueError (not bare Exception) for consistency with the
                # companion rm-patch submit script; message typos fixed
                # ("indiciating an identifer" -> "indicating an identifier").
                raise ValueError(
                    "Can't patch payload {} since there isn't a '{}' field indicating an identifier for the record to be PATCHED.".format(
                        euu.print_format_dict(payload), RECORD_ID_FIELD))
            payload.pop(RECORD_ID_FIELD)
            payload.update({conn.ENCID_KEY: record_id})
            conn.patch(payload=payload, extend_array_values=not overwrite_array_values)
def main():
    """
    Script entry point: launches a transfer of the given ENCODE File records
    from AWS S3 into a GCP bucket via the Connection's
    ``gcp_transfer_from_aws`` method.
    """
    args = get_parser().parse_args()
    desc = args.description
    aws_creds = args.s3creds
    if aws_creds:
        aws_creds = aws_creds.split(":")
    # Connect to the Portal. Without an explicit mode, the Connection falls
    # back to the DCC_MODE environment variable.
    if args.dcc_mode:
        conn = euc.Connection(args.dcc_mode)
    else:
        conn = euc.Connection()
    file_ids = args.file_ids
    if args.infile:
        # Append File IDs from the input file, one per line, ignoring blank
        # lines and '#' comment lines.
        with open(args.infile) as fh:
            for row in fh:
                row = row.strip()
                if row and not row.startswith("#"):
                    file_ids.append(row)
    conn.gcp_transfer_from_aws(
        file_ids=file_ids,
        gcp_bucket=args.gcpbucket,
        gcp_project=args.gcpproject,
        description=desc,
        aws_creds=aws_creds)
def _rekey_payload_for_patch(payload, conn):
    """
    Pops the RECORD_ID_FIELD out of ``payload`` and re-keys its value under
    ``conn.ENCID_KEY`` so the payload identifies the record to PATCH.

    Raises:
        ValueError: when ``payload`` lacks the RECORD_ID_FIELD.
    """
    record_id = payload.get(RECORD_ID_FIELD, False)
    if not record_id:
        raise ValueError(
            "Can't patch payload {} since there isn't a '{}' field indicating an identifier for the record to be PATCHED.".format(
                euu.print_format_dict(payload), RECORD_ID_FIELD))
    payload.pop(RECORD_ID_FIELD)
    payload.update({conn.ENCID_KEY: record_id})


def main():
    """
    Script entry point: POSTs (default), PATCHes, or remove-and-PATCHes
    records of the given profile, one payload per row of the input file.

    Raises:
        ValueError: when patching and a payload lacks the RECORD_ID_FIELD
            needed to identify the record to PATCH.
    """
    parser = get_parser()
    args = parser.parse_args()
    # --rm-patch and --remove-property must be used together.
    if args.rm_patch and not args.remove_property:
        parser.error("No properties to remove were specified. Use --patch if only patching is needed.")
    if args.remove_property and not args.rm_patch:
        parser.error("Properties to remove were specified, but --rm-patch flag was not set.")
    profile_id = args.profile_id
    dcc_mode = args.dcc_mode
    dry_run = args.dry_run
    no_aliases = args.no_aliases
    overwrite_array_values = args.overwrite_array_values
    if dcc_mode:
        conn = euc.Connection(dcc_mode, dry_run)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    # Put conn into submit mode:
    conn.set_submission(True)
    schema = conn.profiles.get_profile_from_id(profile_id)
    infile = args.infile
    patch = args.patch
    rmpatch = args.rm_patch
    if args.remove_property is not None:
        # Guaranteed to be set on the rmpatch path by the validation above.
        props_to_remove = args.remove_property.split(",")
    gen = create_payloads(schema=schema, infile=infile)
    for payload in gen:
        if not patch and not rmpatch:
            conn.post(payload, require_aliases=not no_aliases)
        elif rmpatch:
            _rekey_payload_for_patch(payload, conn)
            conn.remove_and_patch(props=props_to_remove, patch=payload,
                                  extend_array_values=not overwrite_array_values)
        elif patch:
            _rekey_payload_for_patch(payload, conn)
            conn.patch(payload=payload, extend_array_values=not overwrite_array_values)
def main():
    """
    For each experiment ID in the input file, finds read-2 FASTQ files and
    PATCHes each one's ``paired_with`` property to reference the matching
    read-1 alias (``_R2_001`` replaced with ``_R1_001``).

    NOTE(review): reconstructed from collapsed formatting; the trailing
    ``break`` is assumed to exit the alias loop after the first matching
    alias is patched -- confirm against the original layout.
    """
    # Hard-coded to the production Portal.
    conn = euc.Connection("prod")
    reg = re.compile("_R2_001")
    parser = get_parser()
    args = parser.parse_args()
    ids = []
    fh = open(args.infile)
    for line in fh:
        line = line.strip()
        if not line or line.startswith("#"):
            continue
        ids.append(line)
    for i in ids:
        h = conn.get_fastqfile_replicate_hash(exp_id=i)
        for bio_rep in h:
            for tech_rep in h[bio_rep]:
                # Key 2 selects the read-2 files of the technical replicate.
                read_files = h[bio_rep][tech_rep].get(2)
                # read_files is a list of file objects
                if not read_files:
                    continue
                for r in read_files:
                    aliases = r["aliases"]
                    for a in aliases:
                        match = reg.search(a)
                        if match:
                            # Derive the read-1 alias by plain string
                            # replacement of the compiled pattern's text.
                            paired_with_name = a.replace(
                                reg.pattern, "_R1_001")
                            payload = {conn.ENCID_KEY: a}
                            payload["paired_with"] = paired_with_name
                            try:
                                conn.patch(payload=payload)
                            except Exception:
                                # Best-effort: on failure, stop trying this
                                # file's remaining aliases.
                                break
                            break
def test_dry_run_enabled(self):
    """
    Verifies that ``check_dry_run`` reports True when the ``Connection``
    is instantiated in dry-run mode.
    """
    self.conn = connection.Connection(eu.DCC_DEV_MODE, True)
    result = self.conn.check_dry_run()
    self.assertEqual(True, result)
def main():
    """
    Script entry point: runs the given search URL against the Portal and
    dumps the JSON list of results to the output file.
    """
    parser = get_parser()
    args = parser.parse_args()
    dcc_mode = args.dcc_mode
    limit = args.limit
    outfile = args.outfile
    url = args.url
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    results = conn.search(limit=limit, url=url)  # returns a list of search results
    # Search first, then open the output file: a failed search no longer
    # leaves behind an empty file, and the context manager guarantees the
    # handle is closed.
    with open(outfile, "w") as fout:
        json.dump(results, fout)
def main():
    """
    Imports ENCODE Portal Target records into Pulsar: creates new Target
    records, or patches existing ones whose upstream identifier changed.
    Only targets whose organism matches SPECIES are processed; results come
    from either a live Portal search (--url) or a JSON input file.

    Raises:
        Exception: when the Admin user can't be found in the database.
    """
    parser = get_parser()
    args = parser.parse_args()
    url = args.url
    infile = args.infile
    if url:
        conn = euc.Connection("prod")
        results = conn.search(url=url)
    else:
        with open(infile) as fh:
            results = json.load(fh)
    admin = models.User.find_by({"first_name": "Admin"})
    if not admin:
        raise Exception("Could not find the Admin user in the database, which is needed for associating with new records.")
    created = 0
    patched = 0
    total = 0
    for rec in results:
        patch = False
        total += 1
        organism = rec["organism"]["scientific_name"]
        if organism != SPECIES:
            continue
        payload = {}
        label = rec["label"]
        payload["name"] = label
        # Check if the target already exists in the database.
        pulsar_record = models.Target.find_by({"name": label})
        upstream = rec["@id"].strip("/").split("/")[-1]
        if pulsar_record and upstream != pulsar_record["upstream_identifier"]:
            patch = True
        elif pulsar_record:
            # Record exists and its upstream identifier is current.
            continue
        payload["upstream_identifier"] = upstream
        payload["user_id"] = admin["id"]
        xrefs = rec["dbxref"]
        for ref in xrefs:
            # Split on the last ':' only, since identifiers themselves may
            # contain colons. (Removed an unused leftover split() local.)
            prefix, ref = ref.rsplit(":", 1)
            if prefix == "ENSEMBL":
                payload["ensembl"] = ref
            elif prefix == "UniProtKB":
                payload["uniprotkb"] = ref
            elif prefix == "RefSeq":
                payload["refseq"] = ref
        if patch:
            # Bug fix: previously logged "Creating ..." even on the patch path.
            print("Patching {}".format(payload))
            target = models.Target(pulsar_record["id"])
            target.patch(payload)
            patched += 1
            print("Patched: {}".format(patched))
        else:
            print("Creating {}".format(payload))
            models.Target.post(payload)
            created += 1
            print("Created: {}".format(created))
    print("Total processed: {}".format(total))
def main():
    """
    Script entry point: for every experiment returned by the search URL,
    backports each replicate's biosample via ``bp.biosample``.
    """
    args = get_parser().parse_args()
    conn = euc.Connection("prod")
    for result in conn.search(url=args.url):
        experiment = conn.get(result["@id"])
        for replicate in experiment["replicates"]:
            bp.biosample(rec_id=replicate["library"]["biosample"]["@id"])
def main():
    """
    Script entry point: transfers the given ENCODE File records to a GCP
    bucket via the Connection's ``gcp_transfer`` method.
    """
    args = get_parser().parse_args()
    desc = args.description
    creds = args.s3creds
    if creds:
        creds = creds.split(":")
    # Connect to the Portal; without an explicit mode the Connection reads
    # the DCC_MODE environment variable.
    if args.dcc_mode:
        conn = euc.Connection(args.dcc_mode)
    else:
        conn = euc.Connection()
    conn.gcp_transfer(
        file_ids=args.file_ids,
        gcp_bucket=args.gcpbucket,
        gcp_project=args.gcpproject,
        description=desc,
        aws_creds=creds)
def main():
    """
    Script entry point: regenerates AWS upload credentials for each of the
    given File records.

    File IDs come from the command line, plus the input file (one ID per
    line, blank lines and '#' comments ignored) when one is given.
    """
    parser = get_parser()
    args = parser.parse_args()
    file_ids = args.file_ids
    infile = args.infile
    # Connect to the Portal
    dcc_mode = args.dcc_mode
    if dcc_mode:
        conn = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        conn = euc.Connection()
    if infile:
        with open(infile) as fh:
            for line in fh:
                line = line.strip()
                # Also skip '#' comment lines, consistent with the other
                # scripts in this suite (previously only blanks were skipped).
                if not line or line.startswith("#"):
                    continue
                file_ids.append(line)
    for f in file_ids:
        print("Generating upload credentials for File record '{}'.".format(f))
        conn.regenerate_aws_upload_creds(file_id=f)
def main():
    """
    Script entry point: writes a URL list for the given File records to the
    output file via the Connection's ``gcp_transfer_urllist`` method.
    """
    args = get_parser().parse_args()
    # Connect to the Portal; a missing mode falls back to the DCC_MODE
    # environment variable.
    if args.dcc_mode:
        conn = euc.Connection(args.dcc_mode)
    else:
        conn = euc.Connection()
    ids = args.file_ids
    if args.infile:
        # Append File IDs from the input file, ignoring blank lines and '#'
        # comment lines.
        with open(args.infile) as fh:
            for row in fh:
                row = row.strip()
                if row and not row.startswith("#"):
                    ids.append(row)
    conn.gcp_transfer_urllist(file_ids=ids, filename=args.outfile)
#!/usr/bin/env python3 ### # Nathaniel Watson # [email protected] # 2019-02-25 ### import encode_utils.connection as euc conn = euc.Connection("prod") dico = {} # All released antibodies from Snyder's team that are 'characterized to standards', 'partially characterized', # or 'characterized to standards with exemption' in human datasets. url = "https://www.encodeproject.org/search/?type=AntibodyLot&status=released&lot_reviews.status=characterized+to+standards&targets.organism.scientific_name=H**o+sapiens&characterizations.lab.title=Michael+Snyder%2C+Stanford&lot_reviews.status=partially+characterized&lot_reviews.status=characterized+to+standards+with+exemption" # An AntibodyLot has many Characterizations - one for each target (gene). # A Characterization has many characterization_reviews, one for each cell line, primary cell, tissue, etc. results = conn.search(url=url) for i in results: ab = conn.get(i["@id"]) chars = ab["characterizations"] for char in chars: target = char["target"]["label"] try: reviews = char["characterization_reviews"] except KeyError: continue for r in reviews: lane_status = r["lane_status"] if not lane_status == "compliant": continue
import json import pdb import re import encode_utils.connection as euc import encode_utils.utils as euu import pulsarpy.models as models import pulsarpy.utils protocol_regx = re.compile(r'protocol', re.IGNORECASE) ACCESSION_PROP = "accession" ALIASES_PROP = "aliases" UPSTREAM_PROP = "upstream_identifier" UUID_PROP = "uuid" ENC_CONN = euc.Connection("prod") #ADMIN_USER_ID = models.User.find_by({"email": "*****@*****.**"})["id"] ADMIN_USER_ID = 1 # Biosamples to import for Jessika: # https://www.encodeproject.org/search/?type=Biosample&lab.title=Michael+Snyder%2C+Stanford&award.rfa=ENCODE4&biosample_type=tissue # which sum up to the following 30 accessions: #['ENCBS443KFH', 'ENCBS251BGN', 'ENCBS558OUC', 'ENCBS319AUC', 'ENCBS895UDJ', 'ENCBS303LEQ', 'ENCBS208HYM', 'ENCBS273ENZ', 'ENCBS704NZI', 'ENCBS444XVA', 'ENCBS924ALU', 'ENCBS892KPS', 'ENCBS729ENA', 'ENCBS268DOV', 'ENCBS157OMX', 'ENCBS441BYJ', 'ENCBS858UHJ', 'ENCBS577DQE', 'ENCBS655YSD', 'ENCBS064HXH', 'ENCBS790AKV', 'ENCBS437TFK', 'ENCBS465UNR', 'ENCBS278ABD', 'ENCBS131SFJ', 'ENCBS605RWM', 'ENCBS722FKO', 'ENCBS603AMH', 'ENCBS222RAG', 'ENCBS649EIC'] def set_name(rec): """ Most of the models in Pulsar have a name attribute, and most of the time it is required. When backporting a record from the ENCODE Portal, we need some value to use as the record's name, and records in the Portal don't have a name prop, so we need to use some other propery value.
def main():
    """
    Exports an ENCODE experiment and its related records (FASTQ files,
    replicates, biosamples, genetic modifications, libraries) as
    tab-delimited rows appended to per-model files in the output directory,
    for later import into Pulsar.

    NOTE(review): reconstructed from collapsed formatting; the biosample, GM,
    and library sections are assumed to run inside the replicates loop since
    they reference that loop's ``lib``/``bio`` records -- confirm against the
    original layout.
    """
    global CONN
    parser = get_parser()
    args = parser.parse_args()
    exp_id = args.exp
    outdir = args.outdir
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    dcc_mode = args.dcc_mode
    if dcc_mode:
        CONN = euc.Connection(dcc_mode)
    else:
        # Default dcc_mode taken from environment variable DCC_MODE.
        CONN = euc.Connection()
    exp = CONN.get(exp_id)
    # Open output file handles in append mode; a header row is written only
    # when the file is empty (i.e. newly created).
    # experiments file
    exp_file = os.path.join(outdir, EXP_TAB)
    expfh = open(exp_file, "a")
    if file_is_empty(exp_file):
        expfh.write("\t".join(EXP_HEADER) + "\n")
    # files file
    file_file = os.path.join(outdir, FILE_TAB)
    ffh = open(file_file, "a")
    if file_is_empty(file_file):
        ffh.write("\t".join(FILE_HEADER) + "\n")
    # replicates file
    rep_file = os.path.join(outdir, REP_TAB)
    repfh = open(rep_file, "a")
    if file_is_empty(rep_file):
        repfh.write("\t".join(REP_HEADER) + "\n")
    # genetic modifications file
    gm_file = os.path.join(outdir, GM_TAB)
    gmfh = open(gm_file, "a")
    if file_is_empty(gm_file):
        gmfh.write("\t".join(GM_HEADER) + "\n")
    # biosamples file
    bio_file = os.path.join(outdir, BIO_TAB)
    biofh = open(bio_file, "a")
    if file_is_empty(bio_file):
        biofh.write("\t".join(BIO_HEADER) + "\n")
    # libraries file
    lib_file = os.path.join(outdir, LIB_TAB)
    libfh = open(lib_file, "a")
    if file_is_empty(lib_file):
        libfh.write("\t".join(LIB_HEADER) + "\n")
    # START EXPERIMENT ROW
    expfh.write("\t")  # empty for name field in Pulsar
    expfh.write(exp["accession"] + "\t")
    exp_alias = exp["aliases"][0]
    expfh.write(exp_alias + "\t")
    expfh.write(exp["description"] + "\t")
    expfh.write(exp["target"]["name"] + "\t")
    document_aliases = portal_ids_to_aliases(exp["documents"])
    expfh.write(",".join(document_aliases) + "\t")
    submitter_comments = exp.get("submitter_comment", "")
    expfh.write(submitter_comments + "\t")
    expfh.write("\t")  # empty for notes field in Pulsar
    expfh.write("\n")
    # START FILE FILE
    fastq_files = CONN.get_fastqfiles_on_exp(exp_id)
    for f in fastq_files:
        ffh.write("\t")  # empty for name field in Pulsar
        ffh.write(f["accession"] + "\t")
        ffh.write(f["aliases"][0] + "\t")
        platform = f["platform"]["aliases"][-1]
        ffh.write(platform + "\t")
        ffh.write(f.get("submitted_file_name", "") + "\t")
        rep = f["replicate"]
        ffh.write(rep.get("uuid", "") + "\t")
        ffh.write(rep["aliases"][0] + "\t")
        ffh.write(f.get("run_type", "") + "\t")
        controlled_by = f.get("controlled_by", [])
        ffh.write(",".join(controlled_by) + "\t")
        ffh.write(f.get("paired_with", "") + "\t")
        ffh.write(f.get("paired_end", "") + "\t")
        ffh.write(str(f.get("read_count", "")) + "\t")
        ffh.write(str(f.get("read_length", "")) + "\t")
        fc = f.get("flowcell_details", {})
        if fc:
            # Only the first flowcell entry is exported.
            fc = fc[0]
        ffh.write(fc.get("barcode", "") + "\t")
        ffh.write(fc.get("machine", "") + "\t")
        ffh.write(str(fc.get("lane", "")) + "\t")
        ffh.write("\n")
    # START REPLICATE FILE
    reps = exp["replicates"]
    for i in reps:
        repfh.write("\t")  # empty for name field in Pulsar
        repfh.write(i["aliases"][0] + "\t")
        repfh.write(exp_alias + "\t")
        lib = i["library"]
        bio = lib["biosample"]
        repfh.write("\t")  # empty for biosample_id fkey field in Pulsar
        biosample_alias = bio["aliases"][0]
        repfh.write(biosample_alias + "\t")
        repfh.write(str(i["biological_replicate_number"]) + "\t")
        repfh.write(str(i["technical_replicate_number"]) + "\t")
        repfh.write("\t")  # empty for antibody_id fkey field in Pulsar
        antibody = i.get("antibody", "")
        if antibody:
            repfh.write(antibody["accession"] + "\t")
        else:
            repfh.write("\t")
        repfh.write(i.get("submitter_comment", "") + "\t")
        repfh.write("\t")  # empty for notes field in Pulsar
        repfh.write("\n")
        # START BIOSAMPLE FILE
        biofh.write("\t")  # empty for name field in Pulsar
        biofh.write(bio["accession"] + "\t")
        biosample_upstream_id = bio["aliases"][0]
        biofh.write(biosample_upstream_id + "\t")
        biofh.write(bio.get("part_of", "") + "\t")
        biofh.write(bio.get("nih_institutional_certification", "") + "\t")
        pooled_from = bio.get("pooled_from", [])
        biofh.write(",".join(pooled_from) + "\t")
        treatment_dicts = bio.get("treatments", {})
        treatment_uuids = [x["uuid"] for x in treatment_dicts]
        treatment_aliases = portal_ids_to_aliases(treatment_uuids)
        biofh.write(",".join(treatment_aliases) + "\t")
        document_aliases = portal_ids_to_aliases(bio.get("documents", []))
        biofh.write(",".join(document_aliases) + "\t")
        biofh.write(bio["biosample_type"] + "\t")
        biofh.write(bio["biosample_term_name"] + "\t")
        biofh.write(bio["source"]["name"] + "\t")
        biofh.write(bio.get("product_id", "") + "\t")
        biofh.write(bio.get("lot_id", "") + "\t")
        biofh.write(bio["donor"]["aliases"][0] + "\t")
        biofh.write(bio.get("passage_number", "") + "\t")
        # Prefer culture_start_date, falling back to date_obtained.
        date_taken = bio.get("culture_start_date", "")
        if not date_taken:
            date_taken = bio.get("date_obtained", "")
        biofh.write(date_taken + "\t")
        biofh.write(bio.get("submitter_comment", "") + "\t")
        biofh.write("\t")  # empty for notes field in Pulsar
        biofh.write("\n")
        # update gm file
        for gm_id in bio.get("genetic_modifications", []):
            gm = CONN.get(gm_id)
            gmfh.write("\t")  # empty for name field in Pulsar
            gmfh.write(gm["accession"] + "\t")
            gmfh.write(gm["aliases"][0] + "\t")
            gmfh.write(biosample_upstream_id + "\t")
            gmfh.write(gm.get("description", "") + "\t")
            document_aliases = portal_ids_to_aliases(gm.get("documents", []))
            gmfh.write(",".join(document_aliases) + "\t")
            gmfh.write(gm.get("category", "") + "\t")
            gmfh.write(gm.get("purpose", "") + "\t")
            gmfh.write(gm.get("method", "") + "\t")
            guide_seqs = gm.get("guide_rna_sequences", [])
            gmfh.write(",".join(guide_seqs) + "\t")
            tags = gm.get("introduced_tags", [])
            gmfh.write(str(tags) + "\t")
            reagents = gm.get("reagents")
            gmfh.write(str(reagents) + "\t")
            chars = gm.get("characterizations", [])
            gmfh.write(",".join(chars) + "\t")
            gmfh.write("\t")  # empty for crispr_construct_ids in Pulsar
            gmfh.write("\t")  # empty for donor_construct_id in Pulsar
            gmfh.write("\t")  # empty for notes field in pulsar
            gmfh.write("\n")
        # START LIBRARY FILE
        libfh.write("\t")  # empty for name field in Pulsar
        libfh.write(lib["accession"] + "\t")
        libfh.write(lib["aliases"][0] + "\t")
        libfh.write(biosample_upstream_id + "\t")
        libfh.write(lib["nucleic_acid_term_name"] + "\t")
        strand_specific = str(lib.get("strand_specificity", False))
        libfh.write(strand_specific + "\t")
        document_aliases = portal_ids_to_aliases(lib["documents"])
        libfh.write(",".join(document_aliases) + "\t")
        libfh.write(lib.get("size_range", "") + "\t")
        treatment_aliases = portal_ids_to_aliases(lib["treatments"])
        libfh.write(",".join(treatment_aliases) + "\t")
        libfh.write(lib.get("source", "") + "\t")
        libfh.write(lib.get("product_id", "") + "\t")
        libfh.write(lib.get("lot_id", "") + "\t")
        libfh.write(lib.get("fragmentation_method", "") + "\t")
        libfh.write("\t")  # empty for sequencing_library_prep_kit_id field in Pulsar
        libfh.write("\t")  # empty for paired_end field in Pulsar
        libfh.write("\t")  # empty for barcode_id in Pulsar
        libfh.write("\t")  # empty for paired_barcode_id in Pulsar
        libfh.write(lib.get("submitter_comment", "") + "\t")
        libfh.write("\t")  # empty for notes field in pulsar
        libfh.write("\n")
    expfh.close()
    ffh.close()
    repfh.close()
    biofh.close()
    libfh.close()
    gmfh.close()
def test_arbitrary_host(self):
    """
    A ``Connection`` can be instantiated against an arbitrary host name
    rather than a registered DCC mode.
    """
    self.conn = connection.Connection(dcc_mode="test.encodedcc.org")
def setUp(self):
    """Creates a fresh dev-mode ``Connection`` before each test."""
    self.conn = connection.Connection(eu.DCC_DEV_MODE)