def start_toil(job):
    import pandas as pd
    work_dir = job.fileStore.getLocalTempDir()

    # in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    #
    # pdb_file = os.path.join(work_dir, "PDB.h5")
    # in_store.read_input_file("PDB.h5", pdb_file)
    #
    # sfams = pd.read_hdf(pdb_file, "Superfamilies", columns=
    #     ["sfam_id"]).drop_duplicates().dropna()["sfam_id"].sort_values()

    # Run a single hard-coded superfamily; the commented-out block above
    # reads the full superfamily list from PDB.h5 instead
    sfams = [299845.0]

    map_job(job, calculate_features_for_sfam, sfams)
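# `map_job` is a molmimic helper rather than part of Toil's API. Below is a
# minimal sketch of the behavior assumed throughout this file -- fan one child
# job out per item, forwarding any shared arguments -- the real helper may
# batch items to limit job counts. `map_job_sketch` is a hypothetical name.
def map_job_sketch(job, func, items, *args):
    for item in items:
        job.addChildJobFn(func, item, *args)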
def calculate_features_for_sfam(job, sfam_id, further_parallelize=False):
    work_dir = job.fileStore.getLocalTempDir()
    pdb_store = IOStore.get("aws:us-east-1:molmimic-full-structures")
    out_store = IOStore.get("aws:us-east-1:molmimic-structure-features")

    # A structure is done only when all three feature files exist for it
    extensions = set(["atom.npy", "residue.npy", "edges.gz"])
    done_files = lambda k: set(f.rsplit("_", 1)[1] for f in
        out_store.list_input_directory(k))
    pdb_keys = [k for k in pdb_store.list_input_directory(str(int(sfam_id)))
        if k.endswith(".pdb") and extensions != done_files(os.path.splitext(k)[0])]

    if further_parallelize:
        map_job(job, calculate_features, pdb_keys)
    else:
        for pdb_key in pdb_keys:
            calculate_features(job, pdb_key, work_dir=work_dir)
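# `IOStore` abstracts a blob store (here S3) behind a "scheme:region:bucket"
# string, a pattern similar to toil-vg's IOStore. The stub below only records
# the interface assumed by the functions in this file; it is not an
# implementation, and `IOStoreSketch` is a hypothetical name.
class IOStoreSketch(object):
    @staticmethod
    def get(store_string):
        # e.g. "aws:us-east-1:molmimic-ibis" -> a store backed by that bucket
        raise NotImplementedError
    def read_input_file(self, key, local_path):
        # Download the object at `key` to `local_path`
        raise NotImplementedError
    def list_input_directory(self, prefix=""):
        # Iterate over keys under `prefix`
        raise NotImplementedError
    def exists(self, key):
        # True if `key` is present in the store
        raise NotImplementedError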
def start_toil(job, pdbFileStoreID=None):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-ibis".format(prefix))
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    if pdbFileStoreID is None:
        # Download the PDB info file
        pdb_path = os.path.join(work_dir, "PDB.h5")
        in_store.read_input_file("PDB.h5", pdb_path)

        # Add the PDB info file to the Toil job store
        pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_path)
    else:
        pdb_path = job.fileStore.readGlobalFile(pdbFileStoreID)

    ibis_obs_prefix = "IBIS_observed.h5"
    ibis_obs_path = os.path.join(work_dir, ibis_obs_prefix)
    in_store.read_input_file(ibis_obs_prefix, ibis_obs_path)

    # Add the IBIS observed-interaction file to the Toil job store
    ibisObsFileStoreID = job.fileStore.writeGlobalFile(ibis_obs_path)

    # Choose which superfamilies to run, skipping those already finished
    skip_sfam = set(float(f.split("/", 1)[0]) for f in
        out_store.list_input_directory() if f.endswith(".observed_interactome"))
    pdb = filter_hdf_chunks(unicode(ibis_obs_path), "ObsInt",
        columns=["mol_superfam_id"]).drop_duplicates()
    sfams = pdb[~pdb["mol_superfam_id"].isin(skip_sfam)][
        "mol_superfam_id"].drop_duplicates().dropna().astype(int)

    print "Will run a total of {} SFAMs".format(len(sfams))

    # Run all remaining superfamilies
    map_job(job, get_observed_structural_interactome, sfams, pdbFileStoreID,
        ibisObsFileStoreID)

    # Clean up after the mapped jobs complete
    job.addFollowOnJobFn(cleanup)
    os.remove(ibis_obs_path)
    os.remove(pdb_path)
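# A hedged sketch of how a root job like `start_toil` (the two definitions
# above presumably live in different modules) would be launched with Toil's
# public API; the job-store location and options here are assumptions.
if __name__ == "__main__":
    from toil.common import Toil
    from toil.job import Job

    options = Job.Runner.getDefaultOptions("aws:us-east-1:molmimic-jobstore")
    options.logLevel = "INFO"

    with Toil(options) as workflow:
        # The "{scheme}:{region}" prefix parsed inside start_toil comes from
        # this job-store string
        workflow.start(Job.wrapJobFn(start_toil))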
def compare_sfams(job, useExisting=False, observed=True):
    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-missing-structures")
    all_missing = "missing_{}.h5".format("observed" if observed else "inferred")
    all_missing_f = os.path.join(work_dir, all_missing)

    if not useExisting or not store.exists(all_missing):
        # Build the superfamily list from the interactome files
        inf_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
        ending = ".{}_interactome".format("observed" if observed else "inferred")
        sfams = [k.split("/", 1)[0] for k in inf_store.list_input_directory()
            if k.endswith(ending)]
    else:
        # Reuse the existing list of missing structures
        store.read_input_file(all_missing, all_missing_f)
        sfams = pd.read_hdf(all_missing_f, "table",
            columns=["sfam"])["sfam"].drop_duplicates()

    map_job(job, compare_sfam, sfams)
    job.addFollowOnJobFn(get_missing, observed=observed)
def merge_inferred_interactome(job, pdbFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    pdb_file = get_file(job, "PDB.h5", pdbFileStoreID)
    sfams = pd.read_hdf(pdb_file, "Superfamilies",
        columns=["sfam_id"]).drop_duplicates()["sfam_id"]
    os.remove(pdb_file)

    # Only merge superfamilies whose observed interactome exists and whose
    # inferred interactome has not been written yet
    skip_sfam = [s for s in sfams if out_store.exists(
        "{0}/{0}.inferred_interactome".format(s))]
    sfam_to_run = [s for s in sfams if out_store.exists(
        "{0}/{0}.observed_interactome".format(s)) and s not in skip_sfam]

    # all_sfam = [os.path.basename(f).split(".") for f in
    #     out_store.list_input_directory() if not f.endswith("failed")]
    # skip_sfam = [f[0] for f in all_sfam if f[1] == "inferred_interactome"]
    # sfam_to_run = [f[0] for f in all_sfam if f[1] == "observed_interactome"
    #     and f[0] not in skip_sfam]

    map_job(job, merge_inferred_interactome_sfam, sfam_to_run)
    job.addFollowOnJobFn(cleanup, sfam_to_run)
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID,
        taxFileStoreID, sfamFileStoreIDs):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    # Read in the H5 file for the entire table
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    sfams = filter_hdf_chunks(tableInfPath, "Intrac{}".format(table),
        columns=["nbr_superfam_id"]).drop_duplicates().dropna()

    # Skip superfamilies that are already finished, or that cannot run
    # because their observed interactome is missing
    skip_sfam = set(s for s in sfams["nbr_superfam_id"] if
        out_store.exists("{0}/{0}.inferred_interactome".format(int(s))) or
        not out_store.exists("{0}/{0}.observed_interactome".format(int(s))))

    # skip_sfam = set([int(f.split("/", 1)[0]) for f in
    #     out_store.list_input_directory()
    #     if f.endswith(".inferred_interactome")])

    sfams = sfams[~sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = sfams["nbr_superfam_id"].drop_duplicates().dropna().astype(int).tolist()

    # partial_sfams = set(int(k.split("/")[0]) for sfam in sfams for k in
    #     out_store.list_input_directory(
    #         "{sfam}/_inftables/Intrac{table}_{sfam}.inferred_interactome".format(
    #             sfam=sfam, table=table)) if not k.endswith("failed"))
    # sfams = list(set(sfams) - partial_sfams)

    if len(sfams) > 0:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    try:
        os.remove(tableInfPath)
    except OSError:
        pass
def get_observed_structural_interactome(job, sfam_id, pdbFileStoreID,
        ibisObsFileStoreID):
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    ibis_obs_path = get_file(job, "IBIS_obs.h5", ibisObsFileStoreID)
    try:
        df = filter_hdf(ibis_obs_path, "ObsInt", "mol_superfam_id",
            float(sfam_id), columns=["obs_int_id"])
        int_ids = df["obs_int_id"].drop_duplicates().dropna().astype(int)
        if len(int_ids) == 0:
            job.log("EMPTY OBS SFAM {}".format(sfam_id))
            print "EMPTY OBS SFAM {}".format(sfam_id)
            return
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        job.log("FAILED OBS SFAM {} {}".format(sfam_id, e))
        print "FAILED OBS SFAM {} {}".format(sfam_id, e)
        return

    # Skip rows that have already been processed successfully
    current_rows = set(int(os.path.basename(key)[:-3]) for key in
        out_store.list_input_directory("{}/_obsrows".format(int(sfam_id)))
        if not key.endswith("failed"))
    int_ids = list(set(int_ids) - current_rows)

    print "Will run {} ids: {}".format(len(int_ids), int_ids)

    if len(int_ids) > 0:
        # Add a job for each interaction
        map_job(job, process_observed_interaction, int_ids, sfam_id,
            ibisObsFileStoreID, pdbFileStoreID)

        # Merge the converted residues once all rows are done
        job.addFollowOnJobFn(merge_interactome_rows, sfam_id)
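# `filter_hdf` and `filter_hdf_chunks` are molmimic HDF5 helpers not shown
# here. Minimal sketches of the assumed behavior, built on pandas' HDF5
# reader and assuming PyTables 'table'-format files with the filter column
# stored as a queryable data column; the `_sketch` names are hypothetical.
import pandas as pd

def filter_hdf_sketch(path, key, column, value, columns=None):
    # Push the equality filter down into the HDF5 query
    return pd.read_hdf(path, key, where="{}={!r}".format(column, value),
                       columns=columns)

def filter_hdf_chunks_sketch(path, key, columns=None, chunksize=500000):
    # Stream the table in chunks to bound memory, then reassemble
    chunks = pd.read_hdf(path, key, columns=columns, chunksize=chunksize)
    return pd.concat(chunks, ignore_index=True)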