Пример #1
0
    def bioGRID_homo_HPC_combine():
        """Combine the per-trial result shards of the BioGRID human runs.

        Every folder below ``./resultData/h**o/`` whose name contains
        ``bioGRID`` is visited for trials 6 to 9.  The shard files of a trial
        (names containing ``PPI`` whose third-from-last ``_`` field equals the
        trial number) are accumulated together with their matching
        ``*_score.json`` files; after each shard the running lists are passed
        through ``hr.sort_key_val`` and cut back to ``dataset_len`` entries.
        The final pairs and scores are appended, one JSON line per trial, to
        ``<prefix>_bioGRID_homo_trimmedPPIs.json`` and
        ``<prefix>_bioGRID_homo_trimmedScores.json`` in the same folder.

        Raises:
            IndexError: if a visited folder holds no shard for one of the
                trials, because the output prefix is read from the first
                shard's file name.
        """
        # The cap is half the row count of the second object produced by the
        # parser (presumably the PPI table -- TODO confirm against bg).
        parsed = list(
            bg.parse_bioGRID(
                filename=
                './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
                wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
                wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
                root="../"))
        dataset_len = int(len(parsed[1].index) * 0.5)

        folder_marker = "bioGRID"
        run_name = "bioGRID_homo"
        for folder in os.listdir("./resultData/h**o/"):
            if folder_marker not in folder:
                continue
            for trial in range(6, 10):
                print("folder: {}, trial: {}".format(folder, trial))
                # Re-listed on every trial: this function writes its own
                # output files into the very same folder.
                shards = [
                    name for name in os.listdir("./resultData/h**o/" + folder)
                    if "PPI" in name and name.split("_")[-3] == str(trial)
                ]

                topPPIs, topScores = [], []
                for shard in shards:
                    ppi_path = "./resultData/h**o/{}/{}".format(folder, shard)
                    with open(ppi_path, "r") as f:
                        topPPIs += json.load(f)
                    # The score file shares the shard name up to its last
                    # "_"-separated field.
                    score_path = "./resultData/h**o/{}/{}".format(
                        folder, shard.rsplit("_", 1)[0] + "_score.json")
                    with open(score_path, "r") as f:
                        topScores += json.load(f)
                    # Re-rank and trim after every shard so the running lists
                    # never grow beyond the cap plus one shard.
                    topPPIs, topScores = hr.sort_key_val(topPPIs, topScores)
                    topPPIs = topPPIs[0:dataset_len]
                    topScores = topScores[0:dataset_len]

                prefix = shards[0].split("_")[0]
                ppi_out = "./resultData/h**o/{}/{}_{}_trimmedPPIs.json".format(
                    folder, prefix, run_name)
                with open(ppi_out, "a+") as f:
                    f.write(json.dumps(topPPIs) + "\n")
                score_out = "./resultData/h**o/{}/{}_{}_trimmedScores.json".format(
                    folder, prefix, run_name)
                with open(score_out, "a+") as f:
                    f.write(json.dumps(topScores) + "\n")
Пример #2
0
    def append_precRecMap_multiCore(fNames,
                                    predPPI,
                                    samplePPI,
                                    datasetClass,
                                    coreNo,
                                    isGGI=False,
                                    logging=False):
        """Compute precision/recall maps and merge them into PRCurveMap.json.

        The full interaction list of every supported dataset is loaded, the
        lists selected by ``datasetClass`` are handed to
        ``ppiLPred.precRecMap_multiCore`` and the returned mapping is merged
        into the JSON object stored in ``./resultData/PRCurveMap.json`` (the
        file is created holding ``{}`` when it does not exist yet).

        Args:
            fNames: forwarded unchanged to ``ppiLPred.precRecMap_multiCore``.
            predPPI: forwarded unchanged.
            samplePPI: forwarded unchanged.
            datasetClass: iterable of dataset keys; each must be one of
                ``'bioGRID'``, ``'STRING'``, ``'MINT'``, ``'IntAct_spoke'``.
            coreNo: forwarded unchanged.
            isGGI: when true, index 0 of each parser's output is used,
                otherwise index 1 (presumably the GGI vs. the PPI table --
                TODO confirm against the parsers).
            logging: forwarded unchanged.
        """
        table_idx = 0 if isGGI else 1

        def _node_pairs(parsed):
            # Pick the requested table and turn its two node columns into a
            # plain list of [nodeA, nodeB] lists.
            frame = list(parsed)[table_idx]
            return [list(row) for row in np.asarray(frame[['nodeA', 'nodeB']])]

        # All four datasets are parsed up front, regardless of datasetClass.
        fullPPISet = {
            'bioGRID': _node_pairs(bg.parse_bioGRID(root='../')),
            'STRING': _node_pairs(string.parse_STRING(root='../')),
            'MINT': _node_pairs(MINT.parse_MINT(root='../')),
            'IntAct_spoke': _node_pairs(
                IntAct.parse_IntAct(root='../', spokeModel=True)),
        }

        # Seed the map file so the read below always finds a JSON object.
        if not os.path.exists("./resultData/PRCurveMap.json"):
            with open("./resultData/PRCurveMap.json", "w") as f:
                f.write(json.dumps({}))

        references = [fullPPISet[key] for key in datasetClass]
        precRecMap = ppiLPred.precRecMap_multiCore(fNames, predPPI, samplePPI,
                                                   references, coreNo, logging)

        with open('./resultData/PRCurveMap.json', 'r') as f:
            fullPrecRecMap = json.load(f)
        fullPrecRecMap.update(precRecMap)
        with open('./resultData/PRCurveMap.json', 'w') as f:
            f.write(json.dumps(fullPrecRecMap))
Пример #3
0
    def trim_ppi_result(fNames, datasetClass):
        """Trim stored predictions to half the size of their source dataset.

        For each name in ``fNames`` the files ``./resultData/<name>_PPI.json``
        and ``./resultData/<name>_score.json`` are read line by line; every
        line is a JSON list that is cut to the first ``trimNum[dataset]``
        entries.  The trimmed lists are appended as ``{name: [...]}`` JSON
        lines to ``./resultData/trimmed_predPPIs.json`` and
        ``./resultData/trimmed_predScores.json``.

        Args:
            fNames: result-file name stems.
            datasetClass: dataset key for each entry of ``fNames`` (same
                position); one of ``'bioGRID'``, ``'STRING'``, ``'MINT'``,
                ``'IntAct_spoke'``.
        """

        def _half_rows(parsed):
            # Half the row count of the second object produced by a parser.
            return int(len(list(parsed)[1].index) * 0.5)

        trimNum = {
            'bioGRID': _half_rows(bg.parse_bioGRID(root='../')),
            'STRING': _half_rows(string.parse_STRING(root='../')),
            'MINT': _half_rows(MINT.parse_MINT(root='../')),
            'IntAct_spoke': _half_rows(
                IntAct.parse_IntAct(root='../', spokeModel=True)),
        }

        # Both outputs are (re)created empty whenever the PPI output is absent.
        if not os.path.exists('./resultData/trimmed_predPPIs.json'):
            for target in ('./resultData/trimmed_predPPIs.json',
                           './resultData/trimmed_predScores.json'):
                with open(target, 'w') as f:
                    pass

        for pos, name in enumerate(fNames):
            with open("./resultData/{}_PPI.json".format(name), 'r') as f:
                kept_ppis = [
                    json.loads(line)[0:trimNum[datasetClass[pos]]]
                    for line in f
                ]
            with open("./resultData/{}_score.json".format(name), 'r') as f:
                kept_scores = [
                    json.loads(line)[0:trimNum[datasetClass[pos]]]
                    for line in f
                ]
            with open('./resultData/trimmed_predPPIs.json', 'a+') as f:
                f.write(json.dumps({name: kept_ppis}) + "\n")
            with open('./resultData/trimmed_predScores.json', 'a+') as f:
                f.write(json.dumps({name: kept_scores}) + "\n")
Пример #4
0
    def trim_multiple_ppi_result(fNames, datasetClass, trialSize):
        """Trim the per-trial prediction files of several runs.

        For each name in ``fNames`` and each trial ``j`` in
        ``range(trialSize)`` the JSON lists stored in
        ``./resultData/<name>_<j>_PPI.json`` and
        ``./resultData/<name>_<j>_score.json`` are cut to the first
        ``trimNum[dataset]`` entries.  The per-trial lists are collected and
        appended as ``{name: [...]}`` JSON lines to
        ``./resultData/trimmed_predPPIs.json`` and
        ``./resultData/trimmed_predScores.json``.

        Args:
            fNames: result-file name stems.
            datasetClass: dataset key for each entry of ``fNames`` (same
                position); one of ``'bioGRID'``, ``'STRING'``, ``'MINT'``,
                ``'IntAct_spoke'``.
            trialSize: number of trial files to read per name.
        """

        def _half_rows(parsed):
            # Half the row count of the second object produced by a parser.
            return int(len(list(parsed)[1].index) * 0.5)

        trimNum = {
            'bioGRID': _half_rows(bg.parse_bioGRID(root='../')),
            'STRING': _half_rows(string.parse_STRING(root='../')),
            'MINT': _half_rows(MINT.parse_MINT(root='../')),
            'IntAct_spoke': _half_rows(
                IntAct.parse_IntAct(root='../', spokeModel=True)),
        }

        # Both outputs are (re)created empty whenever the PPI output is absent.
        if not os.path.exists('./resultData/trimmed_predPPIs.json'):
            for target in ('./resultData/trimmed_predPPIs.json',
                           './resultData/trimmed_predScores.json'):
                with open(target, 'w') as f:
                    pass

        def _load_head(path, pos):
            # Read one JSON list and keep only the leading entries allowed
            # for the dataset at position ``pos``.
            with open(path, 'r') as f:
                return json.load(f)[0:trimNum[datasetClass[pos]]]

        for pos, name in enumerate(fNames):
            kept_ppis, kept_scores = [], []
            for trial in range(trialSize):
                kept_ppis.append(
                    _load_head(
                        "./resultData/{}_{}_PPI.json".format(name, trial),
                        pos))
                kept_scores.append(
                    _load_head(
                        "./resultData/{}_{}_score.json".format(name, trial),
                        pos))
            with open('./resultData/trimmed_predPPIs.json', 'a+') as f:
                f.write(json.dumps({name: kept_ppis}) + "\n")
            with open('./resultData/trimmed_predScores.json', 'a+') as f:
                f.write(json.dumps({name: kept_scores}) + "\n")
Пример #5
0
    def bioGRID_homo_tenTrial_to_csvData():
        """Export the top predicted pairs of each ten-trial run as TSV files.

        Trimmed predictions are read from
        ``./resultData/h**o/trimmed_predPPIs_homo.json`` (one JSON object per
        line, all merged into one dict).  For every base tag, the entry
        ``<baseTag>_tenTrial_bioGRID_homo`` is a list of per-trial pair lists.
        The first ``dataset_len`` pairs of each trial are translated through
        ``helper.uniprot_map()``; pairs with an unmapped member are dropped.
        Each trial is written to ``./GoSemSimPrepData/<tag>_<trial>.csv`` as
        tab-separated lines under a ``nodeA``/``nodeB`` header.
        """
        _, ppi_frame = bg.parse_bioGRID(
            filename=
            './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
            wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
            wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
            root="../")
        # A tenth of the half-dataset size, truncated at both steps.
        half_len = int(len(ppi_frame.index) * 0.5)
        dataset_len = int(half_len * 0.1)
        geneToEntry = helper.uniprot_map()

        predPPIs = {}
        with open('./resultData/h**o/trimmed_predPPIs_homo.json', 'r') as f:
            for line in f:
                predPPIs.update(json.loads(line))

        baseTags = [
            'commonNeighbor', 'xyContrib_dualCN_uvJoin', 'CRA', 'Sim',
            'CH2_L3', 'L3uvJoin'
        ]
        for dataset in ["bioGRID_homo"]:
            for baseTag in baseTags:
                tag = "{}_tenTrial_{}".format(baseTag, dataset)

                # Translate every trial before any file of this tag is opened.
                mapped_trials = []
                for trial_pairs in predPPIs[tag]:
                    mapped = []
                    for pair in trial_pairs[:dataset_len]:
                        if pair[0] in geneToEntry and pair[1] in geneToEntry:
                            mapped.append(
                                [geneToEntry[pair[0]], geneToEntry[pair[1]]])
                    mapped_trials.append(mapped)

                for trial_no, rows in enumerate(mapped_trials):
                    with open(
                            './GoSemSimPrepData/{}_{}.csv'.format(
                                tag, trial_no), 'w') as f:
                        lines = ['nodeA\tnodeB']
                        lines.extend("\t".join(ppi) for ppi in rows)
                        f.write("\n".join(lines) + "\n")
Пример #6
0
# Example: generating ExactL3 predictions.
# The simple penalization index is defined as xyContrib in ppiLPred.helperFunc,
# and the Jaccard coefficient as dualCN, so both are selected through the
# custom scoreArgs parameter.
# scoreArgs is a positional list:
#   ['normFunc'=?, 'uvSpec'=?, 'xySpec'=?, 'uvContrib'=?, 'xyContrib'=?, 'dualCN'=?, 'uvJoin'=?]
# For ExactL3 it is ['null', 'null', 'null', 'null', 'basic', 'basic', 'basic'].
ExactL3_PPIs, ExactL3_Scores = ppiLPred._PPILinkPred(
    nodePairs,
    PPIr,
    scoringMethod="interStr",
    scoreArgs=['null', 'null', 'null', 'null', 'basic', 'basic', 'basic'])
ExactL3_sortedPPIs, ExactL3_sortedScores = helper.sort_key_val(
    ExactL3_PPIs, ExactL3_Scores)
print(ExactL3_sortedPPIs)
print(ExactL3_sortedScores)
print("")

# Example: accessing the bundled databases.
# Available datasets: bioGRID, STRING, MINT, IntAct, HuRI.
import bioGRID as bg
bg_GI, bg_PPI = bg.parse_bioGRID(root="./src")
print(bg_PPI.head())
print(bg_GI.head())

# For the other datasets:
# import STRING, MINT, IntAct
# STRING.parse_STRING(root="./src")
# MINT.parse_MINT(root="./src")
# IntAct.parse_IntAct(spokeModel=True, root="./src")
# HuRI.parse_HuRI(root="./src")
# By default they all return the yeast dataset; see the notebook and the other
# provided data-generating scripts for how to get the human dataset, or just ask me.
Пример #7
0
    def trim_multiple_ppi_result(fNames, datasetClass, trialSize):
        """Trim the per-trial prediction files of the human-dataset runs.

        For each name in ``fNames`` and each trial ``j`` in
        ``range(trialSize)`` the JSON lists stored in
        ``./resultData/h**o/<name>_<j>_PPI.json`` and
        ``./resultData/h**o/<name>_<j>_score.json`` are cut to the first
        ``trimNum[dataset]`` entries.  The per-trial lists are collected and
        appended as ``{name: [...]}`` JSON lines to
        ``./resultData/h**o/trimmed_predPPIs.json`` and
        ``./resultData/h**o/trimmed_predScores.json``.

        Args:
            fNames: result-file name stems.
            datasetClass: dataset key for each entry of ``fNames`` (same
                position); one of ``'HuRI'``, ``'bioGRID_homo'``,
                ``'STRING_homo'``, ``'MINT_homo'``.
            trialSize: number of trial files to read per name.
        """

        def _half_rows(frame):
            # Half the row count of a parsed table, truncated to an int.
            return int(len(frame.index) * 0.5)

        def _second(parsed):
            # Second object produced by a parser (presumably the PPI table
            # -- TODO confirm against the parser modules).
            return list(parsed)[1]

        bioGRID_homo = _half_rows(
            _second(
                bg.parse_bioGRID(
                    filename=
                    './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
                    wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
                    wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
                    root="../")))

        STRING_homo = _half_rows(
            _second(
                string.parse_STRING(
                    ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
                    typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
                    uniProtMap=
                    './data/UniProt/uniprot-taxonomy_9606_STRING.tab',
                    root='../',
                    wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
                    wFile_PPI='./data/parsed/STRING_homo_PPI.pkl')))

        MINT_homo = _half_rows(
            _second(
                MINT.parse_MINT(
                    ppiFile='./data/MINT/species human',
                    uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
                    wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
                    wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
                    root="../")))

        # HuRI's parser result is used directly as a single table.
        trimNum = {
            'HuRI': _half_rows(HuRI.parse_HuRI(root='../')),
            "bioGRID_homo": bioGRID_homo,
            "STRING_homo": STRING_homo,
            "MINT_homo": MINT_homo
        }

        # Both outputs are (re)created empty whenever the PPI output is absent.
        if not os.path.exists('./resultData/h**o/trimmed_predPPIs.json'):
            for target in ('./resultData/h**o/trimmed_predPPIs.json',
                           './resultData/h**o/trimmed_predScores.json'):
                with open(target, 'w') as f:
                    pass

        def _load_head(path, pos):
            # Read one JSON list and keep only the leading entries allowed
            # for the dataset at position ``pos``.
            with open(path, 'r') as f:
                return json.load(f)[0:trimNum[datasetClass[pos]]]

        for pos, name in enumerate(fNames):
            kept_ppis, kept_scores = [], []
            for trial in range(trialSize):
                kept_ppis.append(
                    _load_head(
                        "./resultData/h**o/{}_{}_PPI.json".format(
                            name, trial), pos))
                kept_scores.append(
                    _load_head(
                        "./resultData/h**o/{}_{}_score.json".format(
                            name, trial), pos))
            with open('./resultData/h**o/trimmed_predPPIs.json', 'a+') as f:
                f.write(json.dumps({name: kept_ppis}) + "\n")
            with open('./resultData/h**o/trimmed_predScores.json', 'a+') as f:
                f.write(json.dumps({name: kept_scores}) + "\n")
Пример #8
0
    def append_precRecMap_multiCore(fNames,
                                    predPPI,
                                    samplePPI,
                                    datasetClass,
                                    coreNo,
                                    isGGI=False,
                                    logging=False):
        """Compute precision/recall maps for the human datasets and store them.

        The full interaction list of every supported human dataset is loaded,
        the lists selected by ``datasetClass`` are handed to
        ``ppiLPred.precRecMap_multiCore`` and the returned mapping is merged
        into the JSON object stored in ``./resultData/PRCurveMap_homo.json``
        (the file is created holding ``{}`` when it does not exist yet).

        Args:
            fNames: forwarded unchanged to ``ppiLPred.precRecMap_multiCore``.
            predPPI: forwarded unchanged.
            samplePPI: forwarded unchanged.
            datasetClass: iterable of dataset keys; each must be one of
                ``'HuRI'``, ``'bioGRID_homo'``, ``'STRING_homo'``,
                ``'MINT_homo'``.
            coreNo: forwarded unchanged.
            isGGI: when true, index 0 of the bioGRID/STRING/MINT parser
                output is used, otherwise index 1 (presumably the GGI vs. the
                PPI table -- TODO confirm).  HuRI is not affected.
            logging: forwarded unchanged.
        """
        table_idx = 0 if isGGI else 1

        def _rows(frame):
            # Two node columns of a table as a list of [nodeA, nodeB] lists.
            return [list(row) for row in np.asarray(frame[['nodeA', 'nodeB']])]

        def _pick(parsed):
            # Table selected by ``isGGI`` out of a parser's output.
            return list(parsed)[table_idx]

        bioGRID_homo = _rows(
            _pick(
                bg.parse_bioGRID(
                    filename=
                    './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
                    wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
                    wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
                    root="../")))

        STRING_homo = _rows(
            _pick(
                string.parse_STRING(
                    ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
                    typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
                    uniProtMap=
                    './data/UniProt/uniprot-taxonomy_9606_STRING.tab',
                    root='../',
                    wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
                    wFile_PPI='./data/parsed/STRING_homo_PPI.pkl')))

        MINT_homo = _rows(
            _pick(
                MINT.parse_MINT(
                    ppiFile='./data/MINT/species human',
                    uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
                    wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
                    wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
                    root="../")))

        # HuRI's parser result is used directly as a single table.
        fullPPISet = {
            'HuRI': _rows(HuRI.parse_HuRI(root='../')),
            'bioGRID_homo': bioGRID_homo,
            'STRING_homo': STRING_homo,
            'MINT_homo': MINT_homo
        }

        # Seed the map file so the read below always finds a JSON object.
        if not os.path.exists("./resultData/PRCurveMap_homo.json"):
            with open("./resultData/PRCurveMap_homo.json", "w") as f:
                f.write(json.dumps({}))

        references = [fullPPISet[key] for key in datasetClass]
        precRecMap = ppiLPred.precRecMap_multiCore(fNames, predPPI, samplePPI,
                                                   references, coreNo, logging)

        with open('./resultData/PRCurveMap_homo.json', 'r') as f:
            fullPrecRecMap = json.load(f)
        fullPrecRecMap.update(precRecMap)
        with open('./resultData/PRCurveMap_homo.json', 'w') as f:
            f.write(json.dumps(fullPrecRecMap))