def releasinator(self, name, identifier, status):
    '''releases objects into their equivalent released states'''
    patch_dict = {}
    if name in self.current:
        log = "UPDATING: {} {} with status {} ".format(name, identifier, status) + \
            "is now current"
        patch_dict = {"status": "current"}
    elif name in self.finished:
        log = "UPDATING: {} {} with status {} ".format(name, identifier, status) + \
            "is now finished"
        patch_dict = {"status": "finished"}
    else:
        log = "UPDATING: {} {} with status {} ".format(name, identifier, status) + \
            "is now released"
        patch_dict = {"status": "released"}
    if name in self.date_released:
        # if the object would have a date_released give it one
        now = datetime.datetime.now().date()
        patch_dict = {"date_released": str(now), "status": "released"}
        log += " with date {}".format(now)
    logger.info('%s' % log)
    if self.PRINTALL:
        print(log)
    encodedcc.patch_ENCODE(identifier, self.connection, patch_dict)
def main():
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    profiles = encodedcc.get_ENCODE('/profiles/', connection)
    for object_type in profiles.keys():
        profile_properties = encodedcc.get_ENCODE(
            '/profiles/' + object_type, connection).get('properties')
        # we should fix only objects that have the alternate_accessions property
        if profile_properties and profile_properties.get('alternate_accessions'):
            uuid_2_alternate_accessions = {}
            objects = encodedcc.get_ENCODE('search/?type=' + object_type,
                                           connection)['@graph']
            for entry in objects:
                if entry.get('alternate_accessions'):
                    replaced_objects_accessions = []
                    for acc in entry.get('alternate_accessions'):
                        replaced_objects_accessions.extend(
                            retreive_list_of_replaced(acc, connection))
                    if sorted(list(set(replaced_objects_accessions))) != \
                            sorted(entry.get('alternate_accessions')):
                        uuid_2_alternate_accessions[entry['uuid']] = \
                            set(replaced_objects_accessions)
            for uuid in uuid_2_alternate_accessions.keys():
                uuid_sets_counter = 0
                for key in uuid_2_alternate_accessions.keys():
                    if uuid_2_alternate_accessions[uuid] <= \
                            uuid_2_alternate_accessions[key]:
                        uuid_sets_counter += 1
                if uuid_sets_counter == 1:
                    for acc in list(uuid_2_alternate_accessions[uuid]):
                        to_clean_objects = encodedcc.get_ENCODE(
                            'search/?type=Item&accession=' + acc,
                            connection)['@graph']
                        for object_to_clean in to_clean_objects:
                            print(object_to_clean['uuid'] +
                                  ' alternate accessions list ' +
                                  str(object_to_clean['alternate_accessions']) +
                                  ' is removed')
                            encodedcc.patch_ENCODE(
                                object_to_clean['uuid'], connection,
                                {"alternate_accessions": []})
                    print(uuid + ' is patched with ' +
                          str({"alternate_accessions":
                               list(uuid_2_alternate_accessions[uuid])}))
                    encodedcc.patch_ENCODE(
                        uuid, connection,
                        {"alternate_accessions":
                         list(uuid_2_alternate_accessions[uuid])})
def updater(self, exp, con):
    '''helper function runs the update step'''
    temp = encodedcc.get_ENCODE(exp, self.connection).get("controlled_by", [])
    if con not in temp:
        control = temp + [con]
        patch_dict = {"controlled_by": control}
        print("patching experiment file {} with controlled_by {}".format(exp, con))
        encodedcc.patch_ENCODE(exp, self.connection, patch_dict)
    else:
        print("ERROR: controlled_by for experiment file {} already contains {}".format(exp, con))
def file_manager(key, value, connection, obj_type):
    filename = key.split("/")[-1]
    print("Downloading {}".format(filename))
    r = requests.get(key)
    with open(filename, "wb") as outfile:
        outfile.write(r.content)
    if obj_type == "Biosample":
        filepart = filename.split("-")[0]
    else:
        filepart = filename.split("-")[1]
    attach = attachment(filename)
    temp = "_".join(key.split("/")[-2:])
    aliases = ["brenton-graveley:" + temp]
    if encodedcc.get_ENCODE(quote(aliases[0]), connection)['status'] != 'error':
        removing_patch = {'status': 'deleted', 'aliases': []}
        print('DELETING ' + aliases[0] + ' ' + str(removing_patch))
        encodedcc.patch_ENCODE(quote(aliases[0]), connection, removing_patch)
    upload = {
        "aliases": aliases,
        "attachment": attach,
        "award": "U54HG007005",
        "document_type": "general protocol",
        "lab": "/labs/brenton-graveley/",
        "status": "released",
        "description": "{obj_type} protocol for {filepart} shRNA followed by RNA-seq".format(
            obj_type=obj_type, filepart=filepart),
    }
    print("Uploading {} as {}".format(filename, aliases[0]))
    encodedcc.new_ENCODE(connection, "Document", upload)
    print("Patching {} with document {}".format(value, aliases[0]))
    if obj_type == "Biosample":
        docs = {"protocol_documents": aliases}
    else:
        docs = {"documents": aliases}
    encodedcc.patch_ENCODE(quote(value), connection, docs)
    print("Removing document {}".format(filename))
    subprocess.run(["rm", filename])
def check_ENCODE(self, idList, connection, otherIdList=[], bothDicts={}):
    for pmid in idList:
        extraData = bothDicts.get(pmid)
        ENCODEvalue = encodedcc.get_ENCODE(
            "/search/?type=publication&searchTerm=PMID:" + pmid, connection)
        if ENCODEvalue.get("@graph"):
            log = "PMID " + pmid + " is listed in ENCODE"
            logger.info('%s' % log)
            uuid = ENCODEvalue.get("@graph")[0].get("uuid")
            if not self.CREATE_ONLY:
                self.compare_entrez_ENCODE(uuid, pmid, connection, extraData)
        else:
            if self.CREATE_ONLY:
                self.get_entrez([pmid])
            titleEntrez = self.entrezDict[pmid].get("title")
            found = False
            for otherID in otherIdList:
                titleENCODE = encodedcc.get_ENCODE(
                    "/search/?type=publication&searchTerm=" + otherID, connection)
                if titleENCODE.get("title") == titleEntrez:
                    log = pmid + " is in ENCODE by a different name " + \
                        titleENCODE.get("uuid")
                    logger.warning('%s' % log)
                    self.compare_entrez_ENCODE(
                        titleENCODE.get("uuid"), pmid, connection, extraData)
                    if self.UPDATE:
                        newIdent = titleENCODE.get("identifiers")
                        newIdent.append("PMID:" + pmid)
                        patch_dict = {"identifiers": newIdent}
                        encodedcc.patch_ENCODE(
                            titleENCODE.get("uuid"), connection, patch_dict)
                    found = True
            if found is False:
                log = "This publication is not listed in ENCODE " + pmid
                logger.warning('%s' % log)
                if self.CREATE:
                    self.POST_COUNT += 1
                    pmidData = self.entrezDict[pmid]
                    log = "POSTing the new object: " + pmid
                    logger.info('%s' % log)
                    post_dict = {
                        "title": pmidData.get("title"),
                        "abstract": pmidData.get("abstract"),
                        "submitted_by": "/users/8b1f8780-b5d6-4fb7-a5a2-ddcec9054288/",
                        "lab": "/labs/encode-consortium/",
                        "award": "/awards/ENCODE/",
                        "categories": extraData.get("categories"),
                        "published_by": extraData.get("published_by"),
                        "date_published": pmidData.get("date_published"),
                        "authors": pmidData.get("authors"),
                        "identifiers": ["PMID:" + pmid],
                        "journal": pmidData.get("journal"),
                        "volume": pmidData.get("volume"),
                        "issue": pmidData.get("issue"),
                        "page": pmidData.get("page"),
                        "status": "published"
                    }
                    if extraData.get("data_used"):
                        post_dict["data_used"] = extraData.get("data_used")
                    encodedcc.new_ENCODE(connection, "publications", post_dict)
def excel_reader(datafile, sheet, update, connection, patchall):
    row = reader(datafile, sheetname=sheet)
    keys = next(row)  # grab the first row of headers
    total = 0
    error = 0
    success = 0
    patch = 0
    for values in row:
        total += 1
        post_json = dict(zip(keys, values))
        post_json = dict_patcher(post_json)
        # add attachments here
        if post_json.get("attachment"):
            attach = attachment(post_json["attachment"])
            post_json["attachment"] = attach
        print(post_json)
        temp = {}
        if post_json.get("uuid"):
            temp = encodedcc.get_ENCODE(post_json["uuid"], connection)
        elif post_json.get("aliases"):
            temp = encodedcc.get_ENCODE(quote(post_json["aliases"][0]), connection)
        elif post_json.get("accession"):
            temp = encodedcc.get_ENCODE(post_json["accession"], connection)
        elif post_json.get("@id"):
            temp = encodedcc.get_ENCODE(post_json["@id"], connection)
        if temp.get("uuid"):
            if patchall:
                e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
                    patch += 1
            else:
                print("Object {} already exists. Would you like to patch it instead?".format(temp["uuid"]))
                i = input("PATCH? y/n ")
                if i.lower() == "y":
                    e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                    if e["status"] == "error":
                        error += 1
                    elif e["status"] == "success":
                        success += 1
                        patch += 1
        else:
            if update:
                print("POSTing data!")
                e = encodedcc.new_ENCODE(connection, sheet, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
    print("{sheet}: {success} out of {total} posted, {error} errors, {patch} patched".format(
        sheet=sheet.upper(), success=success, total=total, error=error, patch=patch))
def renamer(file, connection, update):
    patch_dict = {}
    aliases = file.get("aliases", [])
    # append the "_rm" suffix exactly once (str.rstrip strips a character set, not a suffix)
    submitted = file.get("submitted_file_name", "")
    if submitted.endswith("_rm"):
        submitted = submitted[:-len("_rm")]
    submitted = submitted + "_rm"
    patch_dict["submitted_file_name"] = submitted
    if any(aliases):
        # likewise, append the "_replaced" suffix exactly once
        alias = aliases[0]
        if alias.endswith("_replaced"):
            alias = alias[:-len("_replaced")]
        alias = [alias + "_replaced"]
        patch_dict["aliases"] = alias
    else:
        print("skipping {} with no aliases".format(file["@id"]))
    print("file {} with data {}".format(file["@id"], patch_dict))
    if update:
        encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
def uploader(file_object, update):
    aws_return_code = encodedcc.upload_file(file_object, update)
    if aws_return_code:
        logger.warning('Non-zero AWS upload return code %d' % aws_return_code)
        print("Retrying upload to S3...")
        creds = file_object["upload_credentials"]
        expire = parse(creds["expiration"]).date()
        now = datetime.datetime.now().date()
        if now > expire:
            new_file_object = encodedcc.ENC_Item(connection, file_object["@id"])
            print("Your upload credentials are stale. Getting new credentials.")
            file_object = new_file_object.new_creds()
            aws_retry = encodedcc.upload_file(file_object, update)
            if aws_retry:
                logger.warning('Non-zero AWS upload return code %d' % aws_retry)
                encodedcc.patch_ENCODE(file_object["@id"], connection,
                                       {"status": "upload failed"})
    return aws_return_code
def replacer(file, connection, update):
    if file.get("aliases"):
        # this has aliases
        if file["aliases"][0].endswith("_replaced"):
            # this is one of the old ones; strip the "_replaced" suffix to
            # recover the alias of the replacement file
            alias = file["aliases"][0][:-len("_replaced")]
            old_acc = file["accession"]
            old_date = file["date_created"]
            print(old_acc)
            new = encodedcc.get_ENCODE(quote(alias), connection)
            new_acc = new["accession"]
            new_date = new["date_created"]
            patch_dict = {"status": "replaced", "alternate_accessions": [alias]}
            #print("file {} with date {} replaces file {} with date {}".format(new_acc, new_date, old_acc, old_date))
            if update:
                encodedcc.patch_ENCODE(file["@id"], connection, patch_dict)
    else:
        print("file {} has no aliases".format(file["@id"]))
def excel_reader(datafile, sheet, update, connection, patchall):
    row = reader(datafile, sheetname=sheet)
    keys = next(row)  # grab the first row of headers
    total = 0
    error = 0
    success = 0
    patch = 0
    json_properties = encodedcc.get_ENCODE(
        '/profiles/{}.json'.format(sheet), connection)['properties']
    new_accessions_aliases = []
    failed_postings = []
    for values in row:
        total += 1
        post_json = dict(zip(keys, values))
        post_json = dict_patcher(post_json)
        post_json = expose_objects(post_json, json_properties)
        # add attachments here
        if post_json.get("attachment"):
            attach = attachment(post_json["attachment"])
            post_json["attachment"] = attach
        print(post_json)
        temp = {}
        # Silence get_ENCODE failures.
        with encodedcc.print_muted():
            if post_json.get("uuid"):
                temp = encodedcc.get_ENCODE(post_json["uuid"], connection)
            elif post_json.get("aliases"):
                temp = encodedcc.get_ENCODE(quote(post_json["aliases"][0]),
                                            connection)
            elif post_json.get("accession"):
                temp = encodedcc.get_ENCODE(post_json["accession"], connection)
            elif post_json.get("@id"):
                temp = encodedcc.get_ENCODE(post_json["@id"], connection)
        if temp.get("uuid"):
            if patchall:
                e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                if e["status"] == "error":
                    error += 1
                elif e["status"] == "success":
                    success += 1
                    patch += 1
            else:
                print("Object {} already exists. Would you like to patch it instead?".format(temp["uuid"]))
                i = input("PATCH? y/n ")
                if i.lower() == "y":
                    e = encodedcc.patch_ENCODE(temp["uuid"], connection, post_json)
                    if e["status"] == "error":
                        error += 1
                    elif e["status"] == "success":
                        success += 1
                        patch += 1
        else:
            if update:
                print("POSTing data!")
                e = encodedcc.new_ENCODE(connection, sheet, post_json)
                if e["status"] == "error":
                    error += 1
                    failed_postings.append(
                        post_json.get('aliases', 'alias not specified'))
                elif e["status"] == "success":
                    new_object = e['@graph'][0]
                    # Print now and later.
                    print('New accession/UUID: {}'.format(
                        new_object.get('accession', new_object.get('uuid'))))
                    new_accessions_aliases.append(
                        (new_object.get('accession', new_object.get('uuid')),
                         new_object.get('aliases')))
                    success += 1
    print("{sheet}: {success} out of {total} posted, {error} errors, {patch} patched".format(
        sheet=sheet.upper(), success=success, total=total, error=error, patch=patch))
    if new_accessions_aliases:
        print('New accession/UUID and alias:'
              if len(new_accessions_aliases) == 1
              else 'New accessions/UUIDs and aliases:')
        for (accession, alias) in new_accessions_aliases:
            if len(alias) == 0:
                alias = 'alias not specified'
            else:
                alias = ', '.join(alias) if isinstance(alias, list) else alias
            print(accession, alias)
    if failed_postings:
        print('Posting failed for {} object(s):'.format(len(failed_postings)))
        for alias in failed_postings:
            print(', '.join(alias) if isinstance(alias, list) else alias)
def main():
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on", connection.server)
    if args.update:
        assert args.user, "A user must be provided to run this script!"
        user = encodedcc.get_ENCODE(args.user, connection).get("@id")
        assert user, "{} was not found in the ENCODE database as a registered user. Please try again".format(args.user)
    data = []
    idList = []
    with open(args.infile, "r") as tsvfile:
        reader = csv.DictReader(tsvfile, delimiter='\t')
        for row in reader:
            data.append(row)
    for item in data:
        lanes = item.get("lanes", "")
        lanes = list(set(lanes.split(",")))
        item["lanes"] = lanes
        if not any(item["notes"]):
            item.pop("notes")
        if item.get("@id") not in idList:
            idList.append(item["@id"])
    objDict = {key: [] for key in idList}
    for item in data:
        objDict.get(item.get("@id", ""), "").append(item)
    for idNum in objDict.keys():
        antibody = encodedcc.get_ENCODE(idNum, connection, frame="edit")
        new_antibody = {}
        if antibody.get("primary_characterization_method"):
            reviews = antibody.get("characterization_reviews", [])
            enc_docs = antibody.get("documents", [])
            file_docs = []
            for obj in objDict[idNum]:
                if obj.get("documents"):
                    for doc in obj["documents"].split(","):
                        file_docs.append(doc)
                if obj.get("notes"):
                    new_antibody["notes"] = obj["notes"]
            for doc in file_docs:
                if ":" in doc:
                    doc = quote(doc)
                link = encodedcc.get_ENCODE(doc, connection).get("@id")
                if link:
                    if link not in enc_docs:
                        enc_docs.append(link)
            #######################
            # begin lanes checking
            #######################
            enc_lanes_check = []
            file_lanes_check = []
            flag = False
            for r in reviews:
                enc_lanes_check.append(r["lane"])
            for item in objDict[idNum]:
                for l in item["lanes"]:
                    file_lanes_check.append(int(l))
            if len(set(enc_lanes_check)) < len(enc_lanes_check):
                # duplicate lanes in ENCODE
                print("Possible duplicate lanes in ENCODE")
                flag = True
            if len(set(file_lanes_check)) < len(file_lanes_check):
                # duplicate lanes in file
                print("Possible duplicate lanes in file")
                flag = True
            if len(set(enc_lanes_check) - set(file_lanes_check)) > 0:
                # more lanes in ENCODE than in file
                print("Found lanes in ENCODE not in the file")
                flag = True
            if len(set(file_lanes_check) - set(enc_lanes_check)) > 0:
                # more lanes in file than in ENCODE
                print("Found lanes in the file not in ENCODE")
                flag = True
            if flag:
                print("Some problem was found with the number of lanes in the file as compared to ENCODE")
                print("Do you want to continue running the program or exit and check the data?")
                i = input("Continue? y/n ")
                assert i.upper() == "Y"  # exit the script
            for r in reviews:
                for line in objDict[idNum]:
                    for lane in line["lanes"]:
                        if int(lane) == r["lane"]:
                            if line["lane_status"].lower() == "pending dcc review":
                                print("can't set to pending review, need manual override")
                                fin = input("Change the status to 'pending dcc review'? y/n ")
                                if fin.upper() == "Y":
                                    r["lane_status"] = line["lane_status"].lower()
                                    for link in enc_docs:
                                        if encodedcc.get_ENCODE(link, connection).get("document_type", "") == "standards document":
                                            # drop standards documents by value
                                            enc_docs.remove(link)
                                else:
                                    pass
                            else:
                                r["lane_status"] = line["lane_status"].lower()
            # now all lanes in reviews should be updated to document
            enc_comp = 0
            enc_ncomp = 0
            other = 0
            for r in reviews:
                if r.get("lane_status", "") == "compliant":
                    enc_comp = enc_comp + 1
                elif r.get("lane_status", "") == "not compliant":
                    enc_ncomp = enc_ncomp + 1
                else:
                    other = other + 1
            if other > 0:
                print("not all lanes have allowed status, antibody characterization status set to not compliant")
                new_antibody["status"] = "not compliant"
            elif enc_comp > 0:
                new_antibody["status"] = "compliant"
            elif other == 0 and enc_comp == 0 and enc_ncomp > 0:
                new_antibody["status"] = "not compliant"
            ######################
            # end lanes checking
            ######################
            if antibody.get("lab", "") == "/labs/michael-snyder/":
                # make sure special document is added if not in the file
                if "michael-snyder:biorad_protein_standard" not in file_docs:
                    file_docs.append("michael-snyder:biorad_protein_standard")
                if antibody["primary_characterization_method"] == "immunoprecipitation":
                    if len(reviews) == 1:
                        # fix lane number
                        reviews[0]["lane"] = 3
            new_antibody["characterization_reviews"] = reviews
            new_antibody["documents"] = enc_docs
            if args.update:
                new_antibody["reviewed_by"] = user
            if args.update:
                print("PATCHing antibody characterization", idNum)
                encodedcc.patch_ENCODE(idNum, connection, new_antibody)
            else:
                print("PATCH data:", new_antibody)
def main():
    args = getArgs()
    outfile = args.outfile
    CREATE_ONLY = args.createonly
    UPDATE_ONLY = args.updateonly
    Entrez.email = args.email
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    if args.debug:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)
    print("Running on ", connection.server)
    publication = PublicationUpdate(args)
    if not UPDATE_ONLY:
        publication.setup_publication()
        pmidList = publication.consortium_ids + publication.community_ids
        mergeDicts = publication.consortium_dict.copy()
        mergeDicts.update(publication.community_dict)
        # holds published_by, categories, and data_used
        if not CREATE_ONLY:
            publication.get_entrez(pmidList)
        community_ENCODE_Only, communityOtherID, consortium_ENCODE_Only, consortiumOtherID = \
            publication.find_ENCODE_extras(publication.community_ids,
                                           publication.consortium_ids, connection)
        total_ENCODE_only = len(community_ENCODE_Only) + len(consortium_ENCODE_Only)
        allOtherIDs = communityOtherID + consortiumOtherID
        publication.check_ENCODE(pmidList, connection, allOtherIDs, mergeDicts)
        log = str(total_ENCODE_only) + " items in ENCODE but not in files"
        logger.info('%s' % log)
        log = str(publication.PATCH_COUNT) + " publication files PATCHed"
        logger.info('%s' % log)
        log = str(publication.POST_COUNT) + " publication files POSTed"
        logger.info('%s' % log)
        print("Results printed to", outfile)
    else:
        infile = UPDATE_ONLY
        with open(infile, 'r') as readfile:
            uuidList = [x.rstrip('\n') for x in readfile]
        # check each publication to see if it has a PMID, if it does add it to the PMID list
        # if it does not have one look it up on Entrez
        pmid_uuid_dict = {}
        for uuid in uuidList:
            pub = encodedcc.get_ENCODE(uuid, connection)
            title = pub.get("title", "")
            identifiers = pub.get("identifiers", [])
            found = False
            for i in identifiers:
                if "PMID:" in i:
                    p = i.split(":")[1]
                    found = True
            if found:
                pmid_uuid_dict[p] = uuid
            else:
                # search Entrez for publication by title
                handle = Entrez.esearch(db="pubmed", term=title)
                record = Entrez.read(handle)
                idlist = record["IdList"]
                if len(idlist) > 1:
                    log = "More than one possible PMID found for " + uuid
                    logger.error('%s' % log)
                    log = str(idlist) + " are possible PMIDs"
                    logger.error('%s' % log)
                elif len(idlist) == 0:
                    log = "No possible PMID found for " + uuid
                    logger.error('%s' % log)
                else:
                    handle = Entrez.efetch(db="pubmed", id=idlist,
                                           rettype="medline", retmode="text")
                    records = Medline.parse(handle)
                    # save the records, you can convert them to a list
                    records = list(records)
                    for record in records:
                        pm = record.get("PMID")
                        ti = record.get("TI")
                        log = "Publication " + uuid + " with title \"" + title + \
                            "\" matches PMID:" + pm + " with title \"" + ti + "\""
                        logger.info('%s' % log)
                        identifiers.append("PMID:" + pm)
                        encodedcc.patch_ENCODE(uuid, connection,
                                               {"identifiers": identifiers})
                        pmid_uuid_dict[pm] = uuid
        pmidList = list(pmid_uuid_dict.keys())
        publication.get_entrez(pmidList)
        with open("pub_update.txt", "w") as f:
            for pmid in pmid_uuid_dict.keys():
                publication.compare_entrez_ENCODE(pmid_uuid_dict[pmid], pmid, connection)
            f.write(str(len(pmid_uuid_dict.keys())) + " publications checked " +
                    str(publication.PATCH_COUNT) + " publications PATCHed")
def compare_entrez_ENCODE(self, uuid, pmid, connection, extraData={}):
    '''compares value in ENCODE database to results from Entrez'''
    encode = encodedcc.get_ENCODE(uuid, connection)
    entrez = self.entrezDict.get(pmid)
    patch = False
    if not entrez:
        log = "PMID " + pmid + " was not found in Entrez database!!"
        logger.warning('%s' % log)
    else:
        log = "PMID " + pmid
        logger.info('%s' % log)
        for key in entrez.keys():
            if key in encode.keys():
                if entrez[key] == encode[key]:
                    log = "entrez key \"" + key + "\" matches encode key"
                    logger.info('%s' % log)
                else:
                    log = "\"" + key + "\" value in encode database does not match value in entrez database"
                    logger.warning('%s' % log)
                    log = "\tENTREZ: " + entrez[key] + "\n\tENCODE: " + encode[key]
                    logger.warning('%s' % log)
                    if self.UPDATE or self.UPDATE_ONLY:
                        log = "PATCH in the new value for \"" + key + "\""
                        logger.info('%s' % log)
                        patch_dict = {key: entrez[key]}
                        encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                        patch = True
            else:
                log = "ENCODE missing \"" + key + "\" from Entrez. New key and value must be added"
                logger.warning('%s' % log)
                if self.UPDATE or self.UPDATE_ONLY:
                    log = "PATCHing in new key \"" + key + "\""
                    logger.info('%s' % log)
                    patch_dict = {key: entrez[key]}
                    encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                    patch = True
    if not self.UPDATE_ONLY:
        for key in extraData.keys():
            if type(extraData.get(key)) is list:
                if set(encode.get(key, [])) == set(extraData.get(key, [])):
                    log = "encode \"" + key + "\" matches data in file"
                    logger.info('%s' % log)
                else:
                    log = "encode \"" + key + "\" value " + \
                        str(encode.get(key, [])) + " does not match file"
                    logger.warning('%s' % log)
                    if self.UPDATE:
                        if any(extraData[key]):
                            patch_dict = {key: extraData[key]}
                            encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                            patch = True
                        else:
                            log = "No value in file to input for \"" + key + "\""
                            logger.warning('%s' % log)
            if type(extraData.get(key)) is str:
                if encode.get(key, "") == extraData.get(key, ""):
                    log = "encode \"" + key + "\" matches data in file"
                    logger.info('%s' % log)
                else:
                    log = "encode \"" + key + "\" value " + \
                        str(encode.get(key, "")) + " does not match file"
                    logger.warning('%s' % log)
                    if self.UPDATE:
                        patch_dict = {key: extraData[key]}
                        encodedcc.patch_ENCODE(uuid, connection, patch_dict)
                        patch = True
    if encode.get("status", "") != "published" and (self.UPDATE or self.UPDATE_ONLY):
        log = "Setting status to published"
        logger.info('%s' % log)
        encodedcc.patch_ENCODE(uuid, connection, {"status": "published"})
        patch = True
    if patch is True:
        self.PATCH_COUNT += 1
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i',
                        help="File containing the JSON object as a JSON string.")
    parser.add_argument('--server',
                        help="Full URL of the server.")
    parser.add_argument('--key', default='default',
                        help="The keypair identifier from the keyfile. "
                             "Default is --key=default")
    parser.add_argument('--keyfile', default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" %
                             (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--authid',
                        help="The HTTP auth ID.")
    parser.add_argument('--authpw',
                        help="The HTTP auth PW.")
    parser.add_argument('--force-put', default=False, action='store_true',
                        help="Force the object to be PUT rather than PATCHed. "
                             "Default is False.")
    parser.add_argument('--get-only', default=False, action='store_true',
                        help="Do nothing but get the object and print it. "
                             "Default is False.")
    parser.add_argument('--id',
                        help="URI for an object")
    parser.add_argument('--debug', default=False, action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--frame', default="object",
                        help="define a frame to get back the JSON object, "
                             "for use with --id. Default is frame=object")
    parser.add_argument('--type',
                        help="the object's type")
    parser.add_argument('--update', default=False, action='store_true',
                        help="Let the script PATCH/POST the data. Default is False")
    args = parser.parse_args()

    global DEBUG_ON
    DEBUG_ON = args.debug

    if args.get_only:
        GET_ONLY = True
    else:
        GET_ONLY = False

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    if args.server and args.authpw and args.authid:
        key.server = args.server
        key.authid = args.authid
        key.authpw = args.authpw
        print("Creating authorization data from command line inputs")
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    if args.update:
        print("This is an UPDATE run! Data will be PATCHed or POSTed accordingly")
    else:
        print("This is a dry run, no data will be changed")

    new_object = False
    if args.id:
        GET_ONLY = True
        print("Taking id to get from --id")
        new_json = {}
        uuid_response = {}
        accession_response = {}
        try:
            id_response = encodedcc.get_ENCODE(args.id, connection, frame=args.frame)
        except:
            id_response = {}
            new_object = True
    else:
        if args.infile:
            infile = open(args.infile, 'r')
        else:
            infile = sys.stdin
        new_json_string = infile.read()
        new_json = json.loads(new_json_string)
        if args.debug:
            encodedcc.pprint_ENCODE(new_json)
        if '@id' in new_json:
            id_response = encodedcc.get_ENCODE(new_json['@id'], connection)
            if id_response.get("code") == 404:
                id_response = {}
                new_object = True
        else:
            id_response = {}
            new_object = True
        if 'uuid' in new_json:
            uuid_response = encodedcc.get_ENCODE(new_json['uuid'], connection)
            if uuid_response.get("code") == 404:
                uuid_response = {}
                new_object = True
        else:
            uuid_response = {}
            new_object = True
        if 'accession' in new_json:
            accession_response = encodedcc.get_ENCODE(new_json['accession'], connection)
            if accession_response.get("code") == 404:
                accession_response = {}
                new_object = True
        else:
            accession_response = {}
            new_object = True
        if new_object:
            print("No identifier in new JSON object. Assuming POST or PUT with auto-accessioning.")

    object_exists = False
    if id_response:
        object_exists = True
        print("Found matching @id:")
        encodedcc.pprint_ENCODE(id_response)
    if uuid_response:
        object_exists = True
        print("Found matching uuid:")
        encodedcc.pprint_ENCODE(uuid_response)
    if accession_response:
        object_exists = True
        print("Found matching accession")
        encodedcc.pprint_ENCODE(accession_response)
    if id_response and uuid_response and (id_response != uuid_response):
        print("Existing id/uuid mismatch")
    if id_response and accession_response and (id_response != accession_response):
        print("Existing id/accession mismatch")
    if uuid_response and accession_response and (uuid_response != accession_response):
        print("Existing uuid/accession mismatch")
    if new_object and object_exists:
        print("Conflict: At least one identifier already exists and at least one does not exist")

    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = list(profiles.keys())
    if "Dataset" not in supported_collections:
        supported_collections.append("Dataset")
    type_list = new_json.pop('@type', [])
    if args.type:
        type_list = [args.type]
    if any(type_list):
        findit = False
        for x in supported_collections:
            if x.lower() == type_list[0].lower():
                type_list = [x]
                findit = True
        if findit:
            if args.debug:
                print("Object will have type of", type_list[0])
        else:
            print("Error! JSON object does not contain one of the supported types")
            print("Provided type:", type_list[0])
            print("Please either change the JSON file or define the type with the --type feature")
            sys.exit(1)
    else:
        print("No type found for JSON object!")
        sys.exit(1)

    possible_collections = [x for x in type_list if x in supported_collections]
    if possible_collections:
        # collection = possible_collections[0] + 's/'
        collection = possible_collections[0]
    else:
        collection = []
    if '@id' in new_json:
        identifier = new_json.pop('@id')
    elif 'uuid' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['uuid'] + '/'
        else:
            identifier = '/' + new_json['uuid'] + '/'
    elif 'accession' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['accession'] + '/'
        else:
            identifier = '/' + new_json['accession'] + '/'

    if 'attachment' in new_json:
        if 'href' in new_json['attachment']:
            pass
        else:
            try:
                filename = new_json['attachment']['download']
                print("Setting filename to %s" % (filename))
            except:
                print("Must specify either href or filename for attachment",
                      file=sys.stderr)
            if new_json['attachment'].get('type'):
                mime_type = new_json['attachment'].get('type')
            else:
                try:
                    mime_type, encoding = mimetypes.guess_type(filename)
                    major, minor = mime_type.split('/')
                    #detected_type = magic.from_file(filename, mime=True)
                    print("Detected mime type %s" % (mime_type))
                except:
                    print("Failed to detect mime type in file %s" % (filename),
                          file=sys.stderr)
            try:
                with open(filename, 'rb') as stream:
                    print("opened")
                    newvalue = {
                        'download': filename,  # Just echoes the given filename as the download name
                        'type': mime_type,
                        # b64encode returns bytes; decode so the data URI is a plain string
                        'href': 'data:%s;base64,%s' % (mime_type,
                                                       b64encode(stream.read()).decode('ascii')),
                    }
                # dump the constructed attachment to a scratch file for inspection
                with open('tmp', 'w') as f:
                    print(newvalue, file=f)
                new_json.update({'attachment': newvalue})  # add
            except:
                print("Cannot open file %s" % (filename), file=sys.stderr)

    if object_exists:
        if args.force_put:
            if not GET_ONLY:
                print("Replacing existing object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection, new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("PATCHing existing object")
                if args.update:
                    e = encodedcc.patch_ENCODE(identifier, connection, new_json)
                    print(e)
    elif new_object:
        if args.force_put:
            if not GET_ONLY:
                print("PUT'ing new object")
                if args.update:
                    e = encodedcc.replace_ENCODE(identifier, connection, new_json)
                    print(e)
        else:
            if not GET_ONLY:
                print("POST'ing new object")
                if not any(collection):
                    print("ERROR: Unable to POST to non-existing collection {}".format(collection))
                    sys.exit(1)
                if args.update:
                    e = encodedcc.new_ENCODE(connection, collection, new_json)
                    print(e)
def main():
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    print("Running on {}".format(connection.server))
    accessions = []
    if args.object:
        if os.path.isfile(args.object):
            accessions = [line.strip() for line in open(args.object)]
        else:
            accessions = args.object.split(",")
    elif args.query:
        if "search" in args.query:
            temp = encodedcc.get_ENCODE(args.query, connection).get("@graph", [])
        else:
            temp = [encodedcc.get_ENCODE(args.query, connection)]
        if any(temp):
            for obj in temp:
                if obj.get("accession"):
                    accessions.append(obj["accession"])
                elif obj.get("uuid"):
                    accessions.append(obj["uuid"])
                elif obj.get("@id"):
                    accessions.append(obj["@id"])
                else:
                    print("ERROR: object has no identifier", file=sys.stderr)
    if len(accessions) == 0:
        print("No accessions to check!", file=sys.stderr)
        sys.exit(1)
    for acc in accessions:
        files = encodedcc.get_ENCODE(acc, connection).get("original_files", [])
        new_files = {}
        old_files = {}
        for f in files:
            file = encodedcc.get_ENCODE(f, connection)
            #renamer(file, connection, args.update)
            #replacer(file, connection, args.update)
            if any(file.get("aliases", [])):
                # this has aliases
                if file["aliases"][0].endswith("_replaced"):
                    # this is one of the old ones
                    dict_maker(file, old_files)
                else:
                    # this is a new file
                    dict_maker(file, new_files)
            else:
                print("file {} has no aliases".format(file["@id"]))
        for new in new_files.keys():
            new_temp = new_files[new]
            for old in old_files.keys():
                old_temp = old_files[old]
                if new_temp["replicate"] == old_temp["replicate"]:
                    #print(new_temp["replicate"], old_temp["replicate"])
                    if new_temp["file_type"] == old_temp["file_type"]:
                        #print(new_temp["file_type"], old_temp["file_type"])
                        if new_temp["run_type"] == old_temp["run_type"]:
                            #print(new_temp["run_type"], old_temp["run_type"])
                            if new_temp["paired_end"] == old_temp["paired_end"]:
                                #print(new_temp["paired_end"], old_temp["paired_end"])
                                print("New file {} with date {} replacing old file {} with date {}".format(
                                    new, new_temp["date"], old, old_temp["date"]))
                                if args.update:
                                    # replace old file
                                    encodedcc.patch_ENCODE(old, connection,
                                                           {"status": "replaced"})
                                    # release and update new file
                                    patch_dict = {"status": "released",
                                                  "alternate_accessions": [old]}
                                    encodedcc.patch_ENCODE(new, connection, patch_dict)
def main():
    import argparse
    parser = argparse.ArgumentParser(
        description=__doc__, epilog=EPILOG,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument('--infile', '-i',
                        help="File containing the JSON object as a JSON string.")
    parser.add_argument('--server',
                        help="Full URL of the server.")
    parser.add_argument('--key', default='default',
                        help="The keypair identifier from the keyfile. "
                             "Default is --key=default")
    parser.add_argument('--keyfile', default=os.path.expanduser("~/keypairs.json"),
                        help="The keypair file. Default is --keyfile=%s" %
                             (os.path.expanduser("~/keypairs.json")))
    parser.add_argument('--authid',
                        help="The HTTP auth ID.")
    parser.add_argument('--authpw',
                        help="The HTTP auth PW.")
    parser.add_argument('--force-put', default=False, action='store_true',
                        help="Force the object to be PUT rather than PATCHed. "
                             "Default is False.")
    parser.add_argument('--get-only', default=False, action='store_true',
                        help="Do nothing but get the object and print it. "
                             "Default is False.")
    parser.add_argument('--id',
                        help="URI for an object")
    parser.add_argument('--debug', default=False, action='store_true',
                        help="Print debug messages. Default is False.")
    parser.add_argument('--frame', default="object",
                        help="define a frame to get back the JSON object, "
                             "for use with --id. Default is frame=object")
    parser.add_argument('--type',
                        help="the object's type")
    args = parser.parse_args()

    global DEBUG_ON
    DEBUG_ON = args.debug

    if args.get_only:
        GET_ONLY = True
    else:
        GET_ONLY = False

    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    new_object = False
    if args.id:
        GET_ONLY = True
        print("Taking id to get from --id")
        new_json = {}
        uuid_response = {}
        accession_response = {}
        try:
            id_response = encodedcc.get_ENCODE(args.id, connection, frame=args.frame)
        except:
            id_response = {}
            new_object = True
    else:
        if args.infile:
            infile = open(args.infile, 'r')
        else:
            infile = sys.stdin
        new_json_string = infile.read()
        new_json = json.loads(new_json_string)
        if '@id' in new_json:
            try:
                id_response = encodedcc.get_ENCODE(new_json['@id'], connection)
            except:
                id_response = {}
                new_object = True
        else:
            id_response = {}
        if 'uuid' in new_json:
            try:
                uuid_response = encodedcc.get_ENCODE(new_json['uuid'], connection)
            except:
                uuid_response = {}
                new_object = True
        else:
            uuid_response = {}
        if 'accession' in new_json:
            try:
                accession_response = encodedcc.get_ENCODE(new_json['accession'], connection)
            except:
                accession_response = {}
                new_object = True
        else:
            print("No identifier in new JSON object. Assuming POST or PUT with auto-accessioning.")
            new_object = True
            accession_response = {}

    object_exists = False
    if id_response:
        object_exists = True
        print("Found matching @id:")
        encodedcc.pprint_ENCODE(id_response)
    if uuid_response:
        object_exists = True
        print("Found matching uuid:")
        encodedcc.pprint_ENCODE(uuid_response)
    if accession_response:
        object_exists = True
        print("Found matching accession")
        encodedcc.pprint_ENCODE(accession_response)
    if id_response and uuid_response and (id_response != uuid_response):
        print("Existing id/uuid mismatch")
    if id_response and accession_response and (id_response != accession_response):
        print("Existing id/accession mismatch")
    if uuid_response and accession_response and (uuid_response != accession_response):
        print("Existing uuid/accession mismatch")
    if new_object and object_exists:
        print("Conflict: At least one identifier already exists and at least one does not exist")

    profiles = encodedcc.get_ENCODE("/profiles/", connection)
    supported_collections = list(profiles.keys())
    if "Dataset" not in supported_collections:
        supported_collections.append("Dataset")
    type_list = new_json.pop('@type', [])
    if args.type:
        type_list = [args.type]
    if any(type_list):
        findit = False
        for x in supported_collections:
            if x.lower() == type_list[0].lower():
                type_list = [x]
                findit = True
        if findit:
            if args.debug:
                print("Object will have type of", type_list[0])
        else:
            print("Error! JSON object does not contain one of the supported types")
            print("Provided type:", type_list[0])
            print("Please either change the JSON file or define the type with the --type feature")
            sys.exit(1)
    else:
        print("No type found for JSON object!")
        sys.exit(1)

    possible_collections = [x for x in type_list if x in supported_collections]
    if possible_collections:
        # collection = possible_collections[0] + 's/'
        collection = possible_collections[0]
    else:
        collection = []
    if '@id' in new_json:
        identifier = new_json.pop('@id')
    elif 'uuid' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['uuid'] + '/'
        else:
            identifier = '/' + new_json['uuid'] + '/'
    elif 'accession' in new_json:
        if collection:
            identifier = '/' + collection + '/' + new_json['accession'] + '/'
        else:
            identifier = '/' + new_json['accession'] + '/'

    if 'attachment' in new_json:
        if 'href' in new_json['attachment']:
            pass
        else:
            try:
                filename = new_json['attachment']['download']
                print("Setting filename to %s" % (filename))
            except:
                print("Must specify either href or filename for attachment",
                      file=sys.stderr)
            if new_json['attachment'].get('type'):
                mime_type = new_json['attachment'].get('type')
            else:
                try:
                    mime_type, encoding = mimetypes.guess_type(filename)
                    major, minor = mime_type.split('/')
                    #detected_type = magic.from_file(filename, mime=True)
                    print("Detected mime type %s" % (mime_type))
                except:
                    print("Failed to detect mime type in file %s" % (filename),
                          file=sys.stderr)
            try:
                with open(filename, 'rb') as stream:
                    print("opened")
                    newvalue = {
                        'download': filename,  # Just echoes the given filename as the download name
                        'type': mime_type,
                        # b64encode returns bytes; decode so the data URI is a plain string
                        'href': 'data:%s;base64,%s' % (mime_type,
                                                       b64encode(stream.read()).decode('ascii')),
                    }
                # dump the constructed attachment to a scratch file for inspection
                with open('tmp', 'w') as f:
                    print(newvalue, file=f)
                new_json.update({'attachment': newvalue})  # add
            except:
                print("Cannot open file %s" % (filename), file=sys.stderr)

    if object_exists:
        if args.force_put:
            if not GET_ONLY:
                print("Replacing existing object")
                e = encodedcc.replace_ENCODE(identifier, connection, new_json)
                print(e)
        else:
            if not GET_ONLY:
                print("Patching existing object")
                e = encodedcc.patch_ENCODE(identifier, connection, new_json)
                print(e)
    elif new_object:
        if args.force_put:
            if not GET_ONLY:
                print("PUT'ing new object")
                e = encodedcc.replace_ENCODE(identifier, connection, new_json)
                print(e)
        else:
            if not GET_ONLY:
                print("POST'ing new object")
                if not any(collection):
                    print("ERROR: Unable to POST to non-existing collection {}".format(collection))
                    sys.exit(1)
                e = encodedcc.new_ENCODE(connection, collection, new_json)
                print(e)
def main():
    args = getArgs()
    key = encodedcc.ENC_Key(args.keyfile, args.key)
    connection = encodedcc.ENC_Connection(key)
    keypair = (key.authid, key.authpw)
    server = key.server
    query = args.query
    objects = \
        encoded_get(server + 'search/?type=AntibodyLot' +
                    '&type=Donor&type=Biosample' +
                    '&type=File&type=Library' +
                    '&type=Dataset&type=Pipeline' +
                    '&type=Replicate' +
                    '&type=Treatment&format=json&' +
                    'frame=object&limit=all&' +
                    query, keypair)['@graph']
    print('There are ' + str(len(objects)) +
          ' objects that should be inspected on the portal')
    counter = 0
    for obj in objects:
        counter += 1
        if counter % 1000 == 0:
            print('Script processed ' + str(counter) + ' objects')
        if obj['status'] not in ['replaced']:
            patching_data = {}
            # fixing links of donor
            fix_replaced_references(obj, 'parent_strains', patching_data, keypair, server)
            fix_replaced_references(obj, 'identical_twin', patching_data, keypair, server)
            fix_replaced_references(obj, 'outcrossed_strain', patching_data, keypair, server)
            fix_replaced_references(obj, 'littermates', patching_data, keypair, server)
            fix_replaced_references(obj, 'fraternal_twin', patching_data, keypair, server)
            fix_replaced_references(obj, 'parents', patching_data, keypair, server)
            fix_replaced_references(obj, 'children', patching_data, keypair, server)
            fix_replaced_references(obj, 'siblings', patching_data, keypair, server)
            # fixing links of file/experiment/biosample
            fix_replaced_references(obj, 'derived_from', patching_data, keypair, server)
            fix_replaced_references(obj, 'paired_with', patching_data, keypair, server)
            fix_replaced_references(obj, 'controlled_by', patching_data, keypair, server)
            fix_replaced_references(obj, 'possible_controls', patching_data, keypair, server)
            fix_replaced_references(obj, 'supersedes', patching_data, keypair, server)
            fix_replaced_references(obj, 'dataset', patching_data, keypair, server)
            fix_replaced_references(obj, 'related_files', patching_data, keypair, server)
            fix_replaced_references(obj, 'related_datasets', patching_data, keypair, server)
            # fixing links of biosample
            fix_replaced_references(obj, 'host', patching_data, keypair, server)
            fix_replaced_references(obj, 'part_of', patching_data, keypair, server)
            fix_replaced_references(obj, 'originated_from', patching_data, keypair, server)
            fix_replaced_references(obj, 'pooled_from', patching_data, keypair, server)
            fix_replaced_references(obj, 'donor', patching_data, keypair, server)
            # fixing links of library
            fix_replaced_references(obj, 'biosample', patching_data, keypair, server)
            # fixing links of treatment
            fix_replaced_references(obj, 'biosamples_used', patching_data, keypair, server)
            fix_replaced_references(obj, 'antibodies_used', patching_data, keypair, server)
            # fixing links of replicate
            fix_replaced_references(obj, 'antibody', patching_data, keypair, server)
            fix_replaced_references(obj, 'experiment', patching_data, keypair, server)
            fix_replaced_references(obj, 'library', patching_data, keypair, server)
            if patching_data:
                print('Patching object ' + obj['@type'][0] + '\t' + obj['uuid'])
                print('OLD DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(obj[k]))
                print('---------')
                print('NEW DATA:')
                for k in patching_data:
                    print('\t' + k + '\t' + str(patching_data[k]))
                print('---------')
                encodedcc.patch_ENCODE(obj['uuid'], connection, patching_data)