def bioGRID_homo_HPC_combine():
    """Combine per-chunk HPC prediction output for the human BioGRID runs.

    For each result folder containing "bioGRID" and each trial 6-9, all
    per-chunk "*_PPI" files (and their matching "*_score.json" files) are
    concatenated, sorted by score, trimmed to half the size of the original
    BioGRID homo PPI dataset, and appended to per-folder trimmed output files.

    NOTE(review): the result directory was garbled to "h**o" by a text
    filter; restored to "homo" to match the "_homo" naming used elsewhere.
    """
    # Trim length: 50% of the original human BioGRID PPI count.
    dataset_len = int(
        len([
            *bg.parse_bioGRID(
                filename=
                './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
                wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
                wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
                root="../")
        ][1].index) * 0.5)
    dataset = "bioGRID"
    datasetName = "bioGRID_homo"
    for folder in os.listdir("./resultData/homo/"):
        if dataset not in folder:
            continue
        for trial in range(6, 10):
            print("folder: {}, trial: {}".format(folder, trial))
            filenames = os.listdir("./resultData/homo/" + folder)
            ppiFiles = [i for i in filenames if "PPI" in i]
            # Chunk files carry the trial number as the third-from-last
            # underscore-separated token.
            curTrialFiles = [
                i for i in ppiFiles if i.split("_")[-3] == str(trial)
            ]
            topPPIs, topScores = [], []
            for i in curTrialFiles:
                with open("./resultData/homo/{}/{}".format(folder, i),
                          "r") as f:
                    topPPIs += json.loads(f.read())
                with open(
                        "./resultData/homo/{}/{}".format(
                            folder,
                            "_".join(i.split("_")[:-1]) + "_score.json"),
                        "r") as f:
                    topScores += json.loads(f.read())
            # Sort PPIs by score, then keep only the top dataset_len entries.
            # NOTE(review): uses hr.sort_key_val while other code in this
            # file uses helper.sort_key_val — confirm both aliases exist.
            topPPIs, topScores = hr.sort_key_val(topPPIs, topScores)
            topPPIs = topPPIs[0:dataset_len]
            topScores = topScores[0:dataset_len]
            with open(
                    "./resultData/homo/{}/{}_{}_trimmedPPIs.json".format(
                        folder, curTrialFiles[0].split("_")[0], datasetName),
                    "a+") as f:
                f.write(json.dumps(topPPIs) + "\n")
            with open(
                    "./resultData/homo/{}/{}_{}_trimmedScores.json".format(
                        folder, curTrialFiles[0].split("_")[0], datasetName),
                    "a+") as f:
                f.write(json.dumps(topScores) + "\n")
def append_precRecMap_multiCore(fNames, predPPI, samplePPI, datasetClass,
                                coreNo, isGGI=False, logging=False):
    """Compute precision/recall curves for the given predictions and merge
    them into ./resultData/PRCurveMap.json (created empty if absent).

    datasetClass selects, per prediction file, which ground-truth network
    ('bioGRID', 'STRING', 'MINT', 'IntAct_spoke') to evaluate against.
    """
    # Each parser yields (GGI frame, PPI frame); pick the requested one.
    frameIdx = 0 if isGGI else 1
    parsedFrames = {
        'bioGRID': [*bg.parse_bioGRID(root='../')][frameIdx],
        'STRING': [*string.parse_STRING(root='../')][frameIdx],
        'MINT': [*MINT.parse_MINT(root='../')][frameIdx],
        'IntAct_spoke':
        [*IntAct.parse_IntAct(root='../', spokeModel=True)][frameIdx]
    }
    # Flatten each ground-truth frame into [nodeA, nodeB] pair lists.
    fullPPISet = {
        name: [list(pair) for pair in np.asarray(df[['nodeA', 'nodeB']])]
        for name, df in parsedFrames.items()
    }
    if not os.path.exists("./resultData/PRCurveMap.json"):
        with open("./resultData/PRCurveMap.json", "w") as f:
            f.write(json.dumps({}))
    precRecMap = ppiLPred.precRecMap_multiCore(
        fNames, predPPI, samplePPI,
        [fullPPISet[name] for name in datasetClass], coreNo, logging)
    # Read-modify-write so earlier entries in the map are preserved.
    with open('./resultData/PRCurveMap.json', 'r') as f:
        fullPrecRecMap = json.loads(f.read())
    fullPrecRecMap.update(precRecMap)
    with open('./resultData/PRCurveMap.json', 'w') as f:
        f.write(json.dumps(fullPrecRecMap))
def trim_ppi_result(fNames, datasetClass):
    """Trim each prediction file to the top PPIs/scores, keeping a count
    equal to half the size of its original source dataset, and append the
    trimmed results to the shared trimmed_pred*.json files."""
    # Per-dataset cutoff: 50% of the original PPI count.
    trimNum = {
        'bioGRID': int(len([*bg.parse_bioGRID(root='../')][1].index) * 0.5),
        'STRING': int(len([*string.parse_STRING(root='../')][1].index) * 0.5),
        'MINT': int(len([*MINT.parse_MINT(root='../')][1].index) * 0.5),
        'IntAct_spoke': int(
            len([*IntAct.parse_IntAct(root='../', spokeModel=True)
                 ][1].index) * 0.5)
    }
    if not os.path.exists('./resultData/trimmed_predPPIs.json'):
        # Create both output files empty so the appends below start fresh.
        with open('./resultData/trimmed_predPPIs.json', 'w') as f:
            pass
        with open('./resultData/trimmed_predScores.json', 'w') as f:
            pass
    for idx, fName in enumerate(fNames):
        cutoff = trimNum[datasetClass[idx]]
        predPPI, predScore = [], []
        # Each line of the input files is one JSON-encoded prediction list.
        with open("./resultData/{}_PPI.json".format(fName), 'r') as f:
            for line in f.readlines():
                predPPI.append(json.loads(line)[0:cutoff])
        with open("./resultData/{}_score.json".format(fName), 'r') as f:
            for line in f.readlines():
                predScore.append(json.loads(line)[0:cutoff])
        with open('./resultData/trimmed_predPPIs.json', 'a+') as f:
            f.write(json.dumps({fName: predPPI}) + "\n")
        with open('./resultData/trimmed_predScores.json', 'a+') as f:
            f.write(json.dumps({fName: predScore}) + "\n")
def trim_multiple_ppi_result(fNames, datasetClass, trialSize):
    """Trim multi-trial prediction files (one file per trial) to the top
    PPIs/scores — half of each source dataset's size — and append the
    per-file results to the shared trimmed_pred*.json files."""
    # Per-dataset cutoff: 50% of the original PPI count.
    trimNum = {
        'bioGRID': int(len([*bg.parse_bioGRID(root='../')][1].index) * 0.5),
        'STRING': int(len([*string.parse_STRING(root='../')][1].index) * 0.5),
        'MINT': int(len([*MINT.parse_MINT(root='../')][1].index) * 0.5),
        'IntAct_spoke': int(
            len([*IntAct.parse_IntAct(root='../', spokeModel=True)
                 ][1].index) * 0.5)
    }
    if not os.path.exists('./resultData/trimmed_predPPIs.json'):
        # Create both output files empty so the appends below start fresh.
        with open('./resultData/trimmed_predPPIs.json', 'w') as f:
            pass
        with open('./resultData/trimmed_predScores.json', 'w') as f:
            pass
    for idx, fName in enumerate(fNames):
        cutoff = trimNum[datasetClass[idx]]
        predPPI, predScore = [], []
        # One whole-file JSON list per trial.
        for trial in range(trialSize):
            with open("./resultData/{}_{}_PPI.json".format(fName, trial),
                      'r') as f:
                predPPI.append(json.loads(f.read())[0:cutoff])
            with open("./resultData/{}_{}_score.json".format(fName, trial),
                      'r') as f:
                predScore.append(json.loads(f.read())[0:cutoff])
        with open('./resultData/trimmed_predPPIs.json', 'a+') as f:
            f.write(json.dumps({fName: predPPI}) + "\n")
        with open('./resultData/trimmed_predScores.json', 'a+') as f:
            f.write(json.dumps({fName: predScore}) + "\n")
def bioGRID_homo_tenTrial_to_csvData():
    """Export the top ten-trial human BioGRID predictions as TSV-style CSV
    files for GO semantic-similarity preprocessing.

    Gene symbols are mapped to UniProt entries via helper.uniprot_map();
    pairs with an unmapped gene are dropped. One file is written per
    method tag and trial under ./GoSemSimPrepData/.

    NOTE(review): the result directory was garbled to "h**o" by a text
    filter; restored to "homo" to match the "_homo" naming used elsewhere.
    """
    bioGRID_GGI_df, bioGRID_PPI_df = bg.parse_bioGRID(
        filename=
        './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
        wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
        wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
        root="../")
    # Keep 10% of the trimmed (half-size) dataset length per trial.
    dataset_len = int(int(len(bioGRID_PPI_df.index) * 0.5) * 0.1)
    geneToEntry = helper.uniprot_map()
    predPPIs = {}
    with open('./resultData/homo/trimmed_predPPIs_homo.json', 'r') as f:
        for line in f.readlines():
            predPPIs.update(json.loads(line))
    baseTags = [
        'commonNeighbor', 'xyContrib_dualCN_uvJoin', 'CRA', 'Sim', 'CH2_L3',
        'L3uvJoin'
    ]
    for dataset in ["bioGRID_homo"]:
        tags = [
            "{}_tenTrial_{}".format(baseTag, dataset) for baseTag in baseTags
        ]
        for tag in tags:
            # Map gene symbols to UniProt entries, dropping unmapped pairs.
            PPIsList = [[[geneToEntry[g[0]], geneToEntry[g[1]]]
                         for g in trialPPIs[:dataset_len]
                         if g[0] in geneToEntry and g[1] in geneToEntry]
                        for trialPPIs in predPPIs[tag]]
            for j in range(len(PPIsList)):
                with open('./GoSemSimPrepData/{}_{}.csv'.format(tag, j),
                          'w') as f:
                    f.write(
                        "\n".join(['nodeA\tnodeB'] +
                                  ["\t".join(ppi)
                                   for ppi in PPIsList[j]]) + "\n")
# Example: generating ExactL3 scores via ppiLPred._PPILinkPred.
#
# "xyContrib" (defined in ppiLPred.helperFunc) is the simple penalization
# index and "dualCN" the Jaccard coefficient; both are enabled through the
# custom scoreArgs parameter, which is laid out as:
#   ['normFunc', 'uvSpec', 'xySpec', 'uvContrib', 'xyContrib', 'dualCN', 'uvJoin']
# For ExactL3 the first four entries are 'null' and the last three 'basic'.
ExactL3_PPIs, ExactL3_Scores = ppiLPred._PPILinkPred(
    nodePairs, PPIr, scoringMethod="interStr",
    scoreArgs=['null', 'null', 'null', 'null', 'basic', 'basic', 'basic'])
ExactL3_sortedPPIs, ExactL3_sortedScores = helper.sort_key_val(
    ExactL3_PPIs, ExactL3_Scores)
print(ExactL3_sortedPPIs)
print(ExactL3_sortedScores)
print("")

# Example: accessing the bundled databases.
# Available datasets: bioGRID, STRING, MINT, IntAct, HuRI.
import bioGRID as bg
bg_GI, bg_PPI = bg.parse_bioGRID(root="./src")
print(bg_PPI.head())
print(bg_GI.head())
# Other datasets:
#   import STRING, MINT, IntAct
#   STRING.parse_STRING(root="./src")
#   MINT.parse_MINT(root="./src")
#   IntAct.parse_IntAct(spokeModel=True, root="./src")
#   HuRI.parse_HuRI(root="./src")
# By default these all return the yeast dataset; see the notebook and the
# provided data-generation scripts to obtain the human dataset.
def trim_multiple_ppi_result(fNames, datasetClass, trialSize):
    """Human-dataset variant: trim multi-trial prediction files to half the
    size of their source dataset (HuRI, bioGRID_homo, STRING_homo,
    MINT_homo) and append the results to the shared homo trimmed files.

    NOTE(review): shadows the earlier yeast-dataset trim_multiple_ppi_result
    of the same name — only the definition executed last is callable.
    NOTE(review): the result directory was garbled to "h**o" by a text
    filter; restored to "homo" to match the "_homo" naming used elsewhere.
    """
    # Per-dataset cutoffs: 50% of each original human PPI count.
    bioGRID_homo = int(
        len([
            *bg.parse_bioGRID(
                filename=
                './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
                wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
                wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
                root="../")
        ][1].index) * 0.5)
    STRING_homo = int(
        len([
            *string.parse_STRING(
                ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
                typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
                uniProtMap=
                './data/UniProt/uniprot-taxonomy_9606_STRING.tab',
                root='../',
                wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
                wFile_PPI='./data/parsed/STRING_homo_PPI.pkl')
        ][1].index) * 0.5)
    MINT_homo = int(
        len([
            *MINT.parse_MINT(
                ppiFile='./data/MINT/species human',
                uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
                wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
                wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
                root="../")
        ][1].index) * 0.5)
    trimNum = {
        'HuRI': int(len(HuRI.parse_HuRI(root='../').index) * 0.5),
        "bioGRID_homo": bioGRID_homo,
        "STRING_homo": STRING_homo,
        "MINT_homo": MINT_homo
    }
    if not os.path.exists('./resultData/homo/trimmed_predPPIs.json'):
        # Create both output files empty so the appends below start fresh.
        with open('./resultData/homo/trimmed_predPPIs.json', 'w') as f:
            pass
        with open('./resultData/homo/trimmed_predScores.json', 'w') as f:
            pass
    for i in range(len(fNames)):
        cutoff = trimNum[datasetClass[i]]
        predPPI, predScore = [], []
        # One whole-file JSON list per trial.
        for j in range(trialSize):
            with open(
                    "./resultData/homo/{}_{}_PPI.json".format(fNames[i], j),
                    'r') as f:
                predPPI.append(json.loads(f.read())[0:cutoff])
            with open(
                    "./resultData/homo/{}_{}_score.json".format(fNames[i], j),
                    'r') as f:
                predScore.append(json.loads(f.read())[0:cutoff])
        with open('./resultData/homo/trimmed_predPPIs.json', 'a+') as f:
            f.write(json.dumps({fNames[i]: predPPI}) + "\n")
        with open('./resultData/homo/trimmed_predScores.json', 'a+') as f:
            f.write(json.dumps({fNames[i]: predScore}) + "\n")
def append_precRecMap_multiCore(fNames, predPPI, samplePPI, datasetClass,
                                coreNo, isGGI=False, logging=False):
    """Human-dataset variant: compute precision/recall curves against the
    full HuRI / bioGRID_homo / STRING_homo / MINT_homo networks and merge
    them into ./resultData/PRCurveMap_homo.json.

    NOTE: shadows the earlier yeast-dataset function of the same name.
    """
    # Each parser yields (GGI frame, PPI frame); pick the requested one.
    frameIdx = 0 if isGGI else 1

    def _edge_pairs(df):
        # Flatten a frame's nodeA/nodeB columns into [a, b] pair lists.
        return [list(pair) for pair in np.asarray(df[['nodeA', 'nodeB']])]

    bioGRID_homo = _edge_pairs([
        *bg.parse_bioGRID(
            filename=
            './data/BioGRID/BIOGRID-ORGANISM-Homo_sapiens-3.5.187.tab2.txt',
            wFile_GGI='./data/parsed/BioGRID_homo_GGI.pkl',
            wFile_PPI='./data/parsed/BioGRID_homo_PPI.pkl',
            root="../")
    ][frameIdx])
    STRING_homo = _edge_pairs([
        *string.parse_STRING(
            ppiFile='./data/STRING/9606.protein.links.v11.0.txt',
            typeFile='./data/STRING/9606.protein.actions.v11.0.txt',
            uniProtMap=
            './data/UniProt/uniprot-taxonomy_9606_STRING.tab',
            root='../',
            wFile_GGI='./data/parsed/STRING_homo_GGI.pkl',
            wFile_PPI='./data/parsed/STRING_homo_PPI.pkl')
    ][frameIdx])
    MINT_homo = _edge_pairs([
        *MINT.parse_MINT(
            ppiFile='./data/MINT/species human',
            uniProtMap="./data/UniProt/uniprot-taxonomy_9606.tab",
            wFile_GGI='./data/parsed/MINT_homo_GGI.pkl',
            wFile_PPI='./data/parsed/MINT_homo_PPI.pkl',
            root="../")
    ][frameIdx])
    # HuRI is read as a single frame (no GGI/PPI split).
    fullPPISet = {
        'HuRI': _edge_pairs(HuRI.parse_HuRI(root='../')),
        'bioGRID_homo': bioGRID_homo,
        'STRING_homo': STRING_homo,
        'MINT_homo': MINT_homo
    }
    if not os.path.exists("./resultData/PRCurveMap_homo.json"):
        with open("./resultData/PRCurveMap_homo.json", "w") as f:
            f.write(json.dumps({}))
    precRecMap = ppiLPred.precRecMap_multiCore(
        fNames, predPPI, samplePPI,
        [fullPPISet[name] for name in datasetClass], coreNo, logging)
    # Read-modify-write so earlier entries in the map are preserved.
    with open('./resultData/PRCurveMap_homo.json', 'r') as f:
        fullPrecRecMap = json.loads(f.read())
    fullPrecRecMap.update(precRecMap)
    with open('./resultData/PRCurveMap_homo.json', 'w') as f:
        f.write(json.dumps(fullPrecRecMap))