示例#1
0
def start_toil(job, sfams=None):
    """Fan feature calculation out over a list of superfamilies.

    Schedules ``calculate_features_for_sfam`` once per superfamily ID through
    ``map_job``.

    Args:
        job: The running job; it is handed straight to ``map_job``.
        sfams: Optional iterable of superfamily IDs to process. When omitted,
            only superfamily ``299845.0`` is run, which is the single
            hard-coded value this function always used before the parameter
            existed.
    """
    if sfams is None:
        # Historical default: one superfamily only. An earlier, now disabled,
        # version read every distinct "sfam_id" from the "Superfamilies"
        # table of PDB.h5 in the "aws:us-east-1:molmimic-ibis" store instead.
        sfams = [299845.0]

    map_job(job, calculate_features_for_sfam, sfams)
def calculate_features_for_sfam(job, sfam_id, further_parallelize=False):
    """Calculate features for every unfinished PDB file of one superfamily.

    Lists the ``.pdb`` keys stored under the superfamily's directory in the
    structure store and keeps those whose output files in the feature store
    do not consist of exactly the three expected suffixes. The remaining keys
    are passed to ``calculate_features``.

    Args:
        job: The running job; provides the local temp dir and is passed on.
        sfam_id: Superfamily ID; converted with ``str(int(sfam_id))`` to form
            the directory name, so a float such as ``299845.0`` is accepted.
        further_parallelize: When true, schedule one child per PDB key via
            ``map_job``; otherwise process the keys sequentially in this job.
    """
    work_dir = job.fileStore.getLocalTempDir()
    pdb_store = IOStore.get("aws:us-east-1:molmimic-full-structures")
    out_store = IOStore.get("aws:us-east-1:molmimic-structure-features")

    expected_suffixes = {"atom.npy", "residue.npy", "edges.gz"}

    def finished_suffixes(key_prefix):
        """Return the trailing ``_``-separated parts of the stored outputs."""
        # [-1] rather than [1]: a key without any underscore used to raise
        # IndexError and abort the whole superfamily. Such a key now simply
        # yields a suffix that is not an expected one.
        return {name.rsplit("_", 1)[-1]
                for name in out_store.list_input_directory(key_prefix)}

    pdb_keys = [
        key for key in pdb_store.list_input_directory(str(int(sfam_id)))
        if key.endswith(".pdb")
        and expected_suffixes != finished_suffixes(os.path.splitext(key)[0])]

    if further_parallelize:
        map_job(job, calculate_features, pdb_keys)
    else:
        # Run in-process, sharing this job's temp directory.
        for pdb_key in pdb_keys:
            calculate_features(job, pdb_key, work_dir=work_dir)
示例#3
0
def start_toil(job, pdbFileStoreID=None):
    """Schedule observed-interactome jobs for all unprocessed superfamilies.

    Makes PDB.h5 and IBIS_observed.h5 available locally and in the job's file
    store, works out which superfamilies do not yet have an
    ``.observed_interactome`` key in the output store, and maps
    ``get_observed_structural_interactome`` over them. A ``cleanup`` follow-on
    is registered and the two local HDF files are removed at the end.

    Args:
        job: The running job.
        pdbFileStoreID: File-store ID of an already uploaded PDB.h5. When
            ``None`` the file is downloaded from the input store and written
            to the file store here.
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("{}:molmimic-ibis".format(prefix))
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    if pdbFileStoreID is not None:
        # PDB info is already in the job store; just fetch a local copy.
        pdb_path = job.fileStore.readGlobalFile(pdbFileStoreID)
    else:
        # Download PDB info and publish it to the job store.
        pdb_path = os.path.join(work_dir, "PDB.h5")
        in_store.read_input_file("PDB.h5", pdb_path)
        pdbFileStoreID = job.fileStore.writeGlobalFile(pdb_path)

    # Download the observed IBIS table and publish it to the job store.
    ibis_obs_name = "IBIS_observed.h5"
    ibis_obs_path = os.path.join(work_dir, ibis_obs_name)
    in_store.read_input_file(ibis_obs_name, ibis_obs_path)
    ibisObsFileStoreID = job.fileStore.writeGlobalFile(ibis_obs_path)

    # Superfamilies that already have an observed interactome are skipped.
    finished_sfams = set()
    for key in out_store.list_input_directory():
        if key.endswith(".observed_interactome"):
            finished_sfams.add(float(key.split("/", 1)[0]))

    observed = filter_hdf_chunks(
        unicode(ibis_obs_path), "ObsInt",
        columns=["mol_superfam_id"]).drop_duplicates()
    still_pending = ~observed["mol_superfam_id"].isin(finished_sfams)
    sfams = observed[still_pending]["mol_superfam_id"]
    sfams = sfams.drop_duplicates().dropna().astype(int)
    print("Will run a total of {} SFAMS".format(len(sfams)))

    # One child job per remaining superfamily.
    map_job(job, get_observed_structural_interactome, sfams, pdbFileStoreID,
            ibisObsFileStoreID)

    # Register cleanup and drop the local copies.
    job.addFollowOnJobFn(cleanup)
    os.remove(ibis_obs_path)
    os.remove(pdb_path)
示例#4
0
def compare_sfams(job, useExisting=False, observed=True):
    """Schedule ``compare_sfam`` for each superfamily, then ``get_missing``.

    The superfamily list comes from one of two places:

    * the ``missing_<kind>.h5`` file in the missing-structures store, when
      ``useExisting`` is true and that file exists; or
    * the first path component of every key in the interfaces store that ends
      with ``.<kind>_interactome``.

    ``<kind>`` is ``"observed"`` or ``"inferred"`` depending on ``observed``.

    Args:
        job: The running job.
        useExisting: Reuse the superfamilies recorded in an existing
            ``missing_<kind>.h5`` instead of listing the interfaces store.
        observed: Select the observed (true) or inferred (false) interactome.
    """
    work_dir = job.fileStore.getLocalTempDir()
    store = IOStore.get("aws:us-east-1:molmimic-missing-structures")
    kind = "observed" if observed else "inferred"
    all_missing = "missing_{}.h5".format(kind)
    all_missing_f = os.path.join(work_dir, all_missing)

    if useExisting and store.exists(all_missing):
        store.read_input_file(all_missing, all_missing_f)
        # The original selected columns=["sfam"] but then indexed ["sfams"],
        # which cannot succeed. Both now use the same name.
        # NOTE(review): "sfam" is assumed to be the real column name; confirm
        # against the code that writes missing_<kind>.h5.
        sfam_column = "sfam"
        sfams = pd.read_hdf(all_missing_f, "table",
                            columns=[sfam_column])[sfam_column].drop_duplicates()
    else:
        inf_store = IOStore.get("aws:us-east-1:molmimic-interfaces")
        ending = ".{}_interactome".format(kind)
        sfams = [k.split("/", 1)[0] for k in inf_store.list_input_directory()
                 if k.endswith(ending)]

    map_job(job, compare_sfam, sfams)

    job.addFollowOnJobFn(get_missing, observed=observed)
def merge_inferred_interactome(job, pdbFileStoreID):
    """Schedule inferred-interactome merging for every eligible superfamily.

    A superfamily is eligible when the interfaces store holds its
    ``<id>/<id>.observed_interactome`` key but not yet its
    ``<id>/<id>.inferred_interactome`` key. ``merge_inferred_interactome_sfam``
    is mapped over the eligible IDs and ``cleanup`` is registered as a
    follow-on with the same list.

    Args:
        job: The running job.
        pdbFileStoreID: Passed to ``get_file`` to obtain a local PDB.h5.
    """
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    pdb_file = get_file(job, "PDB.h5", pdbFileStoreID)
    sfam_ids = pd.read_hdf(pdb_file, "Superfamilies",
                           columns=["sfam_id"])["sfam_id"]
    os.remove(pdb_file)

    # Normalise to plain ints before building store keys. Other code touching
    # these stores formats superfamily IDs through int(...); a float ID would
    # otherwise render as e.g. "299845.0/299845.0.observed_interactome".
    # NOTE(review): assumes sfam_id values are numeric - confirm.
    sfams = sfam_ids.drop_duplicates().dropna().astype(int).tolist()

    # Single pass; the inferred key is only probed when the observed one
    # exists, which also avoids the former O(n^2) list-membership test.
    sfam_to_run = [
        s for s in sfams
        if out_store.exists("{0}/{0}.observed_interactome".format(s))
        and not out_store.exists("{0}/{0}.inferred_interactome".format(s))]

    map_job(job, merge_inferred_interactome_sfam, sfam_to_run)
    job.addFollowOnJobFn(cleanup, sfam_to_run)
def get_inferred_structural_interactome_by_table(job, table, pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs):
    """Schedule per-superfamily inferred-interactome jobs for one IBIS table.

    Fetches ``IBIS_inferred_<table>.h5``, publishes it to the job's file
    store, and maps ``get_table_sfams`` over every neighbour superfamily in
    ``Intrac<table>`` that has an observed interactome but no inferred
    interactome in the interfaces store. The local HDF copy is removed
    afterwards on a best-effort basis.

    Args:
        job: The running job.
        table: Table identifier used in the file name and HDF key.
        pdbFileStoreID: Forwarded unchanged to ``get_table_sfams``.
        taxFileStoreID: Forwarded unchanged to ``get_table_sfams``.
        sfamFileStoreIDs: Forwarded unchanged to ``get_table_sfams``.
    """
    # work_dir and prefix are not used further down; kept as in the original.
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    in_store = IOStore.get("aws:us-east-1:molmimic-ibis")
    out_store = IOStore.get("aws:us-east-1:molmimic-interfaces")

    RealtimeLogger.info("Running table {}".format(table))

    # Read in the H5 for the entire table and share it with child jobs.
    tableInfPath = get_file(job, "IBIS_inferred_{}.h5".format(table), in_store)
    tableInfPathFileStoreID = job.fileStore.writeGlobalFile(tableInfPath)

    nbr_sfams = filter_hdf_chunks(
        tableInfPath, "Intrac{}".format(table),
        columns=["nbr_superfam_id"]).drop_duplicates().dropna()

    # Skip superfamilies that are already inferred, or that lack the observed
    # interactome.
    skip_sfam = set()
    for candidate in nbr_sfams["nbr_superfam_id"]:
        sfam_dir = int(candidate)
        inferred_key = "{0}/{0}.inferred_interactome".format(sfam_dir)
        observed_key = "{0}/{0}.observed_interactome".format(sfam_dir)
        if out_store.exists(inferred_key) or not out_store.exists(observed_key):
            skip_sfam.add(candidate)

    pending = nbr_sfams[~nbr_sfams["nbr_superfam_id"].isin(skip_sfam)]
    sfams = pending["nbr_superfam_id"].drop_duplicates().dropna()
    sfams = sfams.astype(int).tolist()

    if sfams:
        map_job(job, get_table_sfams, sfams, table, tableInfPathFileStoreID,
            pdbFileStoreID, taxFileStoreID, sfamFileStoreIDs)

    # Best-effort removal of the local copy; a missing file is not an error.
    try:
        os.remove(tableInfPath)
    except OSError:
        pass
示例#7
0
def get_observed_structural_interactome(job, sfam_id, pdbFileStoreID,
                                        ibisObsFileStoreID):
    """Schedule processing of one superfamily's observed interactions.

    Reads the ``obs_int_id`` values of ``sfam_id`` from the ``ObsInt`` table,
    drops the IDs that already have a non-failed row under
    ``<sfam_id>/_obsrows`` in the interfaces store, and maps
    ``process_observed_interaction`` over the rest. Unless the superfamily is
    empty or reading fails (both are logged and end the job early),
    ``merge_interactome_rows`` is registered as a follow-on.

    Args:
        job: The running job.
        sfam_id: Superfamily ID to process.
        pdbFileStoreID: Forwarded to ``process_observed_interaction``.
        ibisObsFileStoreID: Used to fetch the IBIS file and forwarded on.
    """
    work_dir = job.fileStore.getLocalTempDir()
    prefix = job.fileStore.jobStore.config.jobStore.rsplit(":", 1)[0]
    out_store = IOStore.get("{}:molmimic-interfaces".format(prefix))

    ibis_obs_path = get_file(job, "IBIS_obs.h5", ibisObsFileStoreID)
    try:
        obs_rows = filter_hdf(ibis_obs_path, "ObsInt", "mol_superfam_id",
                              float(sfam_id), columns=["obs_int_id"])
        int_ids = obs_rows["obs_int_id"].drop_duplicates().dropna().astype(int)
        if len(int_ids) == 0:
            empty_msg = "EMPTY OBS SFAM {}".format(sfam_id)
            job.log(empty_msg)
            print(empty_msg)
            return
    except (SystemExit, KeyboardInterrupt):
        raise
    except Exception as e:
        failed_msg = "FAILED OBS SFAM {} {}".format(sfam_id, e)
        job.log(failed_msg)
        print(failed_msg)
        return

    # Interaction IDs already stored; the last three characters of each key's
    # base name are stripped before the integer conversion.
    rows_dir = "{}/_obsrows".format(int(sfam_id))
    current_rows = set()
    for key in out_store.list_input_directory(rows_dir):
        if key.endswith("failed"):
            continue
        current_rows.add(int(os.path.basename(key)[:-3]))

    int_ids = list(set(int_ids) - current_rows)
    print("Will run {} ids: {}".format(len(int_ids), int_ids))

    if int_ids:
        # Add jobs for each interaction
        map_job(job, process_observed_interaction, int_ids, sfam_id,
                ibisObsFileStoreID, pdbFileStoreID)

    # Merge converted residues
    job.addFollowOnJobFn(merge_interactome_rows, sfam_id)