def pcc_item_rating_pred(path, rating, method, k):
    """Predict ratings with item-based KNN on a normalised matrix and print the RMSE.

    Every record from ``extract_data(path)`` is read as ``(item_id, user_id, ...)``.
    For each one, the ``k`` most similar items (the query item excluded) are
    looked up in an item similarity table and combined into a score, to which
    3 is added.  All scores are handed to ``write`` and compared against
    ``golden()``.

    Args:
        path: Forwarded to ``extract_data`` to load the query records.
        rating: ``'mean'`` divides the summed neighbour values by ``k``;
            ``'weighted'`` weights them by similarity.  Any other value
            leaves every score at 0.
        method: ``'dot'`` builds the table with ``dot_sim``, ``'cos'`` with
            ``cos_sim``.  Any other value leaves the table empty.
        k: Number of neighbours used per query.

    Returns:
        None.  Results are written via ``write``; timing and RMSE are printed.
    """
    started_at = time.time()
    name = 'pcc_item'
    data = extract_data(path)
    # presumably a sparse item x user matrix -- TODO confirm against get_matrix
    mtx = get_matrix(3).toarray()

    # Columns that are entirely zero get a tiny value (zero-division guard).
    empty_columns = ~mtx.any(axis=0)
    mtx[:, empty_columns] = 0.00001

    # Normalise each row by its sum, the row count and its L2 norm.
    # NOTE(review): this subtracts the row *sum* (not the mean) and divides
    # by the number of rows; kept exactly as found -- confirm it is intended.
    row_sums = np.sum(mtx, axis=1)
    row_norms = np.linalg.norm(mtx, axis=1)
    mtx = ((mtx.T - row_sums) / len(mtx) / row_norms).T

    item_mtx = []
    if method == 'dot':
        item_mtx = dot_sim(mtx, name)
    elif method == 'cos':
        # Rows are rescaled by their own norm before being handed to cos_sim.
        scaled = (mtx.T * np.linalg.norm(mtx, axis=1)).T
        item_mtx = cos_sim(scaled, name)

    result = []
    for record in data:
        item_id, user_id = record[0], record[1]
        similarities = item_mtx[item_id]

        # Take the k + 1 best matches, then drop the query itself if it is
        # among them, otherwise drop the weakest, leaving k neighbours.
        knn = np.argsort(similarities, kind='heapsort')[::-1][:k + 1]
        is_query = knn == item_id
        if is_query.any():
            knn = knn[~is_query]
        else:
            knn = np.delete(knn, -1)

        score = 0
        if rating == 'mean':
            user_column = mtx[:, user_id]
            score = np.sum(user_column[knn]) / float(k) + 3
        elif rating == 'weighted':
            user_column = mtx[:, user_id]
            knn_sim = similarities[knn]
            sim_total = np.sum(knn_sim)
            if sim_total != 0:
                weights = knn_sim / sim_total
                score = np.sum(user_column[knn] * weights) + 3
            else:
                # No usable similarity mass: fall back to the mean of the
                # user's non-zero entries.
                score = np.sum(user_column) / np.count_nonzero(user_column) + 3
        result.append(score)

    write(result, name, rating, method, k)
    print('item_rating_pred {} {} {} time : {}'.format(
        method, rating, k, time.time() - started_at))

    gold = golden()
    squared_error = np.square(result - gold)
    print("RMSE :", np.sqrt(np.mean(squared_error)))
def user_rating_pred(path, rating, method, k):
    """Predict ratings with user-based KNN and print the RMSE.

    Every record from ``extract_data(path)`` is read as ``(item_id, user_id, ...)``.
    For each one, the ``k`` most similar users (the query user excluded) are
    looked up in a user similarity table and combined into a score, to which
    3 is added.  All scores are handed to ``write`` and compared against
    ``golden()``.

    Args:
        path: Forwarded to ``extract_data`` to load the query records.
        rating: ``'mean'`` divides the summed neighbour values by ``k``;
            ``'weighted'`` weights them by similarity.  Any other value
            leaves every score at 0.
        method: ``'dot'`` builds the table with ``dot_sim``, ``'cos'`` with
            ``cos_sim``.  Any other value leaves the table empty.
        k: Number of neighbours used per query.

    Returns:
        None.  Results are written via ``write``; timing and RMSE are printed.
    """
    started_at = time.time()
    name = 'user'
    data = extract_data(path)
    # presumably a sparse item x user matrix -- TODO confirm against get_matrix
    mtx = get_matrix(3).toarray()

    # Columns that are entirely zero get a tiny value (zero-division guard).
    empty_columns = ~mtx.any(axis=0)
    mtx[:, empty_columns] = 0.00001

    user_mtx = []
    if method == 'dot':
        user_mtx = dot_sim(mtx, name)
    elif method == 'cos':
        # Each column is scaled by its own norm before being passed to cos_sim.
        column_norms = np.linalg.norm(mtx, axis=0)
        user_mtx = cos_sim(column_norms * mtx, name)

    result = []
    for record in data:
        mv_id, user_id = record[0], record[1]
        similarities = user_mtx[user_id]

        # Take the k + 1 best matches, then drop the query itself if it is
        # among them, otherwise drop the weakest, leaving k neighbours.
        knn = np.argsort(similarities, kind='heapsort')[::-1][:k + 1]
        is_query = knn == user_id
        if is_query.any():
            knn = knn[~is_query]
        else:
            knn = np.delete(knn, -1)

        score = 0
        if rating == 'mean':
            item_row = mtx[mv_id, :]
            score = (np.sum(item_row[knn]) / float(k)) + 3
        elif rating == 'weighted':
            item_row = mtx[mv_id, :]
            knn_sim = similarities[knn]
            sim_total = np.sum(knn_sim)
            if sim_total != 0:
                weights = knn_sim / sim_total
                score = np.sum(item_row[knn] * weights) + 3
            else:
                # No usable similarity mass: fall back to the mean of the
                # row's non-zero entries.
                score = np.sum(item_row) / np.count_nonzero(item_row) + 3
        result.append(score)

    write(result, name, rating, method, k)
    print('user_rating_pred {} {} {} time : {}'.format(
        method, rating, k, time.time() - started_at))

    gold = golden()
    squared_error = np.square(result - gold)
    print("RMSE :", np.sqrt(np.mean(squared_error)))
def main():
    """Parse the command line and run the pathway / sub-pathway analyses.

    Options are read from ``sys.argv`` with ``optparse``.  ``-i`` (two
    mutation maf paths, where the literal string ``None`` marks an absent
    one) and ``-o`` (output folder) are required.

    With one usable input, ``myalgorithm.RSMP`` is followed by
    ``SubPathway.HighCoverSub``.  With two, ``myalgorithm.TSDP`` is followed
    by ``SubPathway.DistinctSub``.  When ``-a`` is given, the pathway step is
    skipped and its table is read from that Excel file instead.  Result
    tables are written as ``.xlsx`` files into the output folder.

    Raises:
        SystemExit: with status 2 when ``-i`` or ``-o`` is missing, or when
            every ``-i`` value is ``None``.
    """
    from optparse import OptionParser

    usage = "usage: %prog [options] -i [INPUT_MAF_FILE_FOLDER] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option(
        "-i", "--i", dest="input", nargs=2, default=None,
        help="Enter mutation maf files path,if cancer vs normal the second arg is None.")
    parser.add_option(
        "-o", "--out", dest="out", nargs=1, default=None,
        help="Enter an output folder.")

    # optional flags
    parser.add_option(
        "-f", "--fdr", dest="fdr", nargs=1, default=0.05,
        help="FDR cut off")
    parser.add_option(
        "-s", "--step", dest="step", nargs=1, default=1,
        help="The maximum number of interval genes allowed between Sub-pathway genes.")
    parser.add_option(
        "-c", "--minsize", dest="minsize", nargs=1, default=3,
        help="Sub-pathway minimum number of nodes.")
    parser.add_option(
        "-p", "--pathway", dest="pathway", nargs=1, default=None,
        help="KEGG human normal pathway filename(absolute path).")
    parser.add_option(
        "-g", "--gene", dest="gene", nargs=1, default=None,
        help="gene information file, gene id and gene symbol,default is NCBI human_gene_info.")
    parser.add_option(
        "-n", "--nperm", dest="nperm", nargs=1, default=1000,
        help="random times")
    parser.add_option(
        "-m", "--symbol", dest="symbol", nargs=1, default=0,
        help="input 1:mutation maf file geneid is 0 but have symbol.")
    parser.add_option(
        "-a", "--sub", dest="sub", nargs=1, default=None,
        help="Determine whether subpath extraction is performed separately.File absolute path.")

    # Retrieve the flags; positional arguments are not used.
    options, _args = parser.parse_args()

    # Validate before touching the filesystem or loading data.  A usage
    # failure must leave with a non-zero status so shell callers can detect it.
    if not options.input or not options.out:
        parser.print_help()
        parser.exit(2, "\nerror: both -i and -o are required\n")
    input_paths = [p for p in options.input if p != 'None']
    if not input_paths:
        # Otherwise the two-sample branch below would fail on mut[0].
        parser.error("-i needs at least one mutation maf file path that is not 'None'")

    # Make the output folder if it doesn't exist.
    outfolder = myutils.folderparser(options.out)

    # Pathway information: built-in selection unless a file was supplied.
    if options.pathway:
        pathway_info = myutils.getpathway(options.pathway)
    else:
        pathway_info = myselect.select_normal_pathway_gene()

    # Sample mutation information, one table per usable input path.
    symbol_flag = int(options.symbol)
    mut = [myselect.select_mutation_gene(p, symbol_flag) for p in input_paths]

    # Gene information: default source unless a file was supplied.
    if options.gene:
        gene_info = myutils.get_gene_info(options.gene)
    else:
        gene_info = myutils.get_gene_info()

    gid = myutils.getgid()
    hsa = myselect.select_human_pathway().set_index('pathway name')

    # NOTE(review): the banner text is kept exactly as it appears in the
    # reviewed source; it looks like a three-line box whose line breaks were
    # lost -- confirm the intended layout.
    begin = (
        " **********************************************************************"
        " * BEGIN *"
        " ********************************************************************** "
    )
    end = (
        " **********************************************************************"
        " * END *"
        " ********************************************************************** "
    )

    step = int(options.step)
    minsize = int(options.minsize)

    if len(mut) == 1:
        # ---- high-coverage pathways (single sample set) ----
        if not options.sub:
            tb = time.time()
            print('-' * 10, 'Non-Random Mutation-High-Cover Pathway', '-' * 10)
            print(begin)
            sig_pathway = myalgorithm.RSMP(mut[0], pathway_info, gid,
                                           int(options.nperm),
                                           float(options.fdr))
            myutils.write(sig_pathway,
                          os.path.join(outfolder, 'sig_pathway.xlsx'))
            te = time.time()
            print('Spend %.2f minute!' % ((te - tb) / 60))
            print(end)
        else:
            # Reuse a previously computed pathway table.
            sig_pathway = pd.read_excel(options.sub)

        # ---- common sub-pathways ----
        te = time.time()
        print('-' * 10, 'Mutation-High-Cover Sub-Pathway', '-' * 10)
        print(begin)
        Sub_Pathway = SubPathway.HighCoverSub(mut[0], gene_info, sig_pathway,
                                              hsa, outfolder, step, minsize)
        myutils.write(Sub_Pathway,
                      os.path.join(outfolder, 'Sub_Pathway.xlsx'))
        print('Spend %.2f minute' % ((time.time() - te) / 60))
        print(end)
    else:
        # ---- subtype-specific pathways (two sample sets) ----
        if not options.sub:
            tb = time.time()
            print('-' * 15, 'Subtype Specificity Pathway', '-' * 15)
            print(begin)
            sig_pathway = myalgorithm.TSDP(mut[0], mut[1], pathway_info,
                                           cut_off=float(options.fdr))
            myutils.write(sig_pathway,
                          os.path.join(outfolder, 'sig_pathway.xlsx'))
            te = time.time()
            print('Spend %.2f minute!' % ((te - tb) / 60))
            print(end)
        else:
            # Reuse a previously computed pathway table.
            sig_pathway = pd.read_excel(options.sub)

        # ---- subtype-specific sub-pathways ----
        te = time.time()
        print('-' * 15, 'Subtype Specificity Sub-Pathway', '-' * 15)
        print(begin)
        Specific_Sub_Pathway = SubPathway.DistinctSub(mut, gene_info,
                                                      sig_pathway, hsa,
                                                      outfolder, step, minsize)
        myutils.write(Specific_Sub_Pathway,
                      os.path.join(outfolder, 'Specific_Sub_Pathway.xlsx'))
        print('Spend %.2f minute' % ((time.time() - te) / 60))
        print(end)