def findthemes(nothemes, wordlist, questionresponses, inc_articles, outputfile):
    # print questionresponses
    synonym_wordlists = wordlist.splitlines()
    exclude_wordlist = []
    stop_path = "englishstop.txt"
    stop_words = surveythemer.loadStopWords(stop_path)
    surveyQuestionResponse = []
    surveyQuestionResponseNID = []
    for response in questionresponses:
        newresp = remove_html_tags(response["text"])
        surveyQuestionResponse.append(newresp)
        surveyQuestionResponseNID.append(response["id"])
    listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = \
        surveythemer.getItemWords(surveyQuestionResponse, stop_words)
    wordMatrix, listOfWordsIncluded, wordCount, fc, ic = \
        surveythemer.createWordMatrix(listOfAllWords, listOfSurveyQuestionWords)
    pc = nothemes
    # Size of the input matrix.
    ic = shape(wordMatrix)[0]
    fc = shape(wordMatrix)[1]
    # Random initialization of the factor matrices.
    w = array([[random.random() for j in range(pc)] for i in range(ic)])
    h = array([[random.random() for j in range(fc)] for i in range(pc)])
    weights, themes = nmf.nmf(wordMatrix, w, h, 0.001, 10, 500)
    themexml = surveythemer.display_themes(weights, themes, listOfWordsIncluded,
                                           surveyQuestionResponse, stemdictionary,
                                           wordCount, inc_articles,
                                           surveyQuestionResponseNID)
    f = open(outputfile, 'w')
    f.write(themexml)
    f.close()
    return
def consensus(V, rank, nloop, method):
    """Calculate the consensus matrix for the columns of V.

    V has n rows and m columns: n = V.shape[0], m = V.shape[1].
    """
    m = V.shape[1]
    connac = np.zeros((m, m))
    for l in range(nloop):
        if method == "pnmf":
            (W, H) = pnmf.projnmf(V, rank, 0.001, 50, 100)
        elif method == "nmf":
            (W, H) = nmf.nmf(V, rank, 0.001, 50, 100)
        elif method == "spnmf":
            (W, H) = spnmf.spnmf(V, rank, 0.001, 50, 100)
        elif method == "snmf":
            (W, H) = snmf.snmf(V, rank, 0.001, 50, 100)
        # Accumulate the connectivity matrix of this run.
        connac = connac + connectivity(H)
    consensus = connac / float(nloop)
    return consensus
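# consensus() relies on connectivity(H), which is not defined in this snippet.
# A minimal sketch, assuming the standard hard-assignment connectivity matrix
# from consensus clustering (Brunet et al., 2004): two items are connected
# when they share the same dominant factor in H.
import numpy as np

def connectivity(H):
    """Return an m x m 0/1 matrix whose (i, j) entry is 1 iff columns i and j
    of H have their maximum in the same row (same cluster assignment)."""
    labels = np.argmax(H, axis=0)  # cluster label of each item (column of H)
    return (labels[:, None] == labels[None, :]).astype(float)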
def multi_evaluation(method, filename, view_names, weights, norm='l2',
                     seed="random", post="direct", data_size=-1):
    """Evaluate baselines (k-means, NMF, SVD) on the combined view of all views
    for multi-view clustering.

    :param method (type: string). Supported methods are "k-means", "nmf" and "svd".
    :param filename (type: string). Path of the input multi-view data (.MAT format).
    :param view_names (type: list<string>). View names of the input multi-view data.
    :param weights (type: list<int>). Weight of each view when building the combined view.
    :param norm (type: string). Normalization strategy for the input data. Supported values:
        'l2': each item vector is normalized by its Euclidean length (l2 norm).
        'l1': each item vector is normalized by the sum of its elements (l1 norm).
        'l0': the whole matrix is normalized by the sum of all its elements
              (i.e. l1 normalization of the whole matrix).
    :param seed (type: string). Initialization method in CoNMF; 'k-means' or 'random':
        'k-means': initialize the W and H matrices from k-means results
                   (see paper [1], Section 4.5).
        'random': initialize the W and H matrices randomly.
    :param post (type: string). Post-processing of the W matrix (m*k) to generate
        the clustering result; 'direct' or 'k-means':
        'direct': for each item vector (m*1), take the element with the largest
                  value as its cluster assignment.
        'k-means': run k-means on the W matrix to get the cluster assignment.
    :param data_size (type: int). Run the clustering algorithm on the first
        data_size items only; -1 means all items. Useful for a quick check when
        the input data is large.
    """
    method = method.lower()
    if method not in ["k-means", "kmeans", "nmf", "svd"]:
        print "Error! Wrong input method name."
        return None
    # Weights can only be integers.
    if len(view_names) != len(weights):
        print "Error! Length of view_names is not equal to the length of weights!"
        return None
    datas, names, groundtruth, cluster_k = dl.loadMATdata(filename, view_names, data_size)
    data = dl.combineData(datas, weights, norm)
    NMIs, As, F1s = [], [], []
    print "Running multi-%s(k=%d,norm=%s,seed=%s,post=%s) for %s, size = %s, #runs: %d" \
        % (method, cluster_k, norm, seed, post, names, str(data.shape), n_runs)
    print "view_names = %s, weights = %s" % (str(names), str(weights))
    i_run = 1
    t0 = time()
    while i_run <= n_runs:
        t1 = time()
        if method == "kmeans" or method == "k-means":
            labels = kmeans(data, cluster_k, norm)[0]
        if method == "nmf":
            labels = nmf(data, cluster_k, norm, seed, post, groundtruth)[0]
        if method == "svd":
            labels = svd(data, cluster_k, norm, post)
        NMI, A, F1, P, RI, ARI = evaluation_scores(groundtruth, labels)
        print "\t %d-th run(time=%ds),<Acc, F1, NMI>\t%f\t%f\t%f" \
            % (i_run, time() - t1, A, F1, NMI)
        NMIs.append(NMI)
        As.append(A)
        F1s.append(F1)
        i_run = i_run + 1
    print "Results of %d runs (mean,std_var):\n\t Acc: %f, %f\n\t F1 : %f, %f\n\t NMI: %f, %f" \
        % (n_runs, np.mean(As), np.std(As), np.mean(F1s), np.std(F1s), np.mean(NMIs), np.std(NMIs))
    print "Running time: %fs" % (time() - t0)
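# A hypothetical invocation of multi_evaluation; the .MAT path and view names
# below are placeholders, not files shipped with this code, and n_runs is
# assumed to be set at module level (as the function body implies):
# multi_evaluation("nmf", "data/multiview.mat", ["view_a", "view_b"], [1, 1],
#                  norm='l2', seed='random', post='direct', data_size=-1)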
def plot_run(plotter):
    a = nmf.tf_idf(nmf.read_term_document())
    terms = nmf.read_terms()
    abort_error = -500
    max_iter = 500
    num_terms = 3
    errors = []
    # Run NMF for each number of clusters c and record the final error.
    for c in range(2, 7):
        term_indices, iterations, best_w, e = nmf.nmf(a, abort_error, max_iter, c, num_terms)
        errors.append(e)
    plotter(errors)
def train():
    """Train with the train set and return the user_user_similarities array.

    Returns:
        user_user_similarities (numpy.ndarray): similarities between users;
            user_user_similarities[i, j] is the similarity between user i and
            user j (the similarity of a user with itself is 1).
        user_ids (numpy.ndarray, vector): user ids (ascending) associated with
            the rows and columns of the array.
    """
    # Load the user-news array.
    user_news_array = numpy.load("user_news_array_of_train.npy")
    user_ids = numpy.load("user_ids_of_train.npy")
    user_num = len(user_ids)
    # NMF
    V = numpy.float16(user_news_array.T)
    K = 15  # TODO refine this parameter
    W, H = nmf.nmf(V, K)
    del V, W
    # estimatedV = numpy.dot(W, H)
    # Calculate the similarity between users as the cosine similarity of the
    # columns of H.
    print("Calculate similarity between users started.")
    time_start = time.time()
    user_user_similarities = numpy.zeros((user_num, user_num), numpy.float16)
    H_norm = numpy.power(H, 2)
    H_norm = H_norm.sum(0)
    H_norm = numpy.sqrt(H_norm)  # norm of each column vector in H
    H_norm = numpy.tile(H_norm, (K, 1))
    eps = numpy.finfo(float).eps
    H_normalized = H / (H_norm + eps)
    H_normalized_transpose = H_normalized.transpose()
    computed_count = 0
    compute_step = 1000  # to avoid MemoryError, only compute a part each time
    while computed_count < user_num:
        compute_upper_limit = min(computed_count + compute_step, user_num)
        user_user_similarities[computed_count:compute_upper_limit, :] = numpy.dot(
            H_normalized_transpose[computed_count:compute_upper_limit, :],
            H_normalized)
        computed_count += compute_step
    del H, H_norm, H_normalized, H_normalized_transpose
    time_end = time.time()
    print("Calculate similarity between users ended. %f s cost." % (time_end - time_start))
    print("[NMF-User-based Collaborative Filtering] Train finished!")
    return user_user_similarities, user_ids
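# Hypothetical use of train(): find the five users most similar to the first
# user. numpy.argsort is standard; the variable names are illustrative only.
# similarities, user_ids = train()
# top5 = numpy.argsort(similarities[0, :])[::-1][1:6]  # skip self at rank 0
# print("Users most similar to %s: %s" % (user_ids[0], user_ids[top5]))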
def findthemes(nothemes, wordlist, question_responses, stop_words):
    synonym_wordlists = wordlist.splitlines()
    exclude_wordlist = []
    surveyQuestionResponse = []
    favourites = []
    questionIDs = []
    for response in question_responses:
        newresp = remove_html_tags(response[0])
        # Replace every synonym with the first (main) word on its line.
        for synliststring in synonym_wordlists:
            synlist = synliststring.split(",")
            mainsyn = synlist[0]
            for synword in synlist[1:]:
                newresp = newresp.replace(synword, mainsyn)
        # print newresp
        surveyQuestionResponse.append(newresp)
        favourites.append(response[1])
        questionIDs.append(response[2])
    listOfAllWords, listOfSurveyQuestionWords, listOfAllSurveyQuestionTitles, stemdictionary = \
        surveythemer.getItemWords(surveyQuestionResponse, stop_words)
    wordMatrix, listOfWordsIncluded, wordCount, fc, ic = \
        surveythemer.createWordMatrix(listOfAllWords, listOfSurveyQuestionWords)
    if wordMatrix.shape[0] < nothemes + 1:
        return '{"themesData": "Not enough data"}'
    w, h = surveythemer.nndsvd(wordMatrix, nothemes, 1)
    if not h.any():
        return '{"themesData": "Empty"}'
    weights, themes = nmf.nmf(wordMatrix, w, h, 0.001, 10, 500)
    json, data = surveythemer.display_themes(weights, themes, listOfWordsIncluded,
                                             surveyQuestionResponse, stemdictionary,
                                             wordCount, favourites, questionIDs)
    returnData = '{"themesData":' + data + ', "jsonData":' + json + '}'
    return returnData
end_time = time.time()
print('reading test data process finish time: %f' % (end_time - start_time))

training_set_normalize = preprocessing.normalize(training_set, axis=0)
test_set_normalize = preprocessing.normalize(test_set, axis=0)

#===============================
# NMF and kNN
#===============================
print('-×-×-×- NMF process begin -×-×-×-')
start_time = time.time()
# NMF: factorize the normalized training set.
training_W, training_H = nmf(training_set_normalize, n_components, max_iter)
end_time = time.time()
print('NMF process end, time: %f' % (end_time - start_time))
# Project the test set onto the learned basis W.
test_H = nmf_keepW(test_set_normalize, training_W)
training_H_trans = training_H.transpose()
test_H_trans = test_H.transpose()
knn(num_neighbors, training_H_trans, training_categorie, test_H_trans, test_categorie)

#===============================
# New method
#===============================
def getMainFeature(feature, data):
    """Extract the main features from df_train(test).csv."""
    Log('...get main feature...')
    df = copy.deepcopy(data)
    # Extract disease categories from the discharge-diagnosis text.
    # Weighting trick: [0, w][cond] yields w when the disease name appears.
    case = df['出院诊断病种名称'].replace(np.nan, u' ').values
    df['case_length'] = [len(value) for value in case]
    df['心脏病'] = [[0, 10]['心脏病' in value] for value in case]
    df['肺心病'] = [[0, 4]['肺心病' in value] for value in case]
    df['高血压'] = [[0, 17]['高血压' in value] for value in case]
    df['冠心病'] = [[0, 10]['冠心病' in value] for value in case]
    df['挂号'] = [[0, 7]['挂号' in value] for value in case]
    df['门特挂号'] = [[0, 13]['门特挂号' in value] for value in case]
    df['糖尿病'] = [[0, 45]['糖尿病' in value] for value in case]
    df['尿毒症'] = [[0, 5]['尿毒症' in value] for value in case]
    df['偏瘫'] = [[0, 14]['偏瘫' in value] for value in case]
    df['精神病'] = [[0, 2]['精神病' in value] for value in case]
    df['是否残疾军人'] = 0
    df.loc[(~df['残疾军人医疗补助基金支付金额'].isnull()) |
           (df['残疾军人医疗补助基金支付金额'] != 0), '是否残疾军人'] = 1
    df['是否城乡救助'] = 0
    df.loc[(~df['城乡救助补助金额'].isnull()) |
           (df['城乡救助补助金额'] != 0), '是否城乡救助'] = 6
    df['是否公务员'] = 0
    df.loc[(~df['公务员医疗补助基金支付金额'].isnull()) |
           (df['公务员医疗补助基金支付金额'] != 0), '是否公务员'] = 4
    df['是否自负'] = 0
    df.loc[(~df['起付标准以上自负比例金额'].isnull()) |
           (df['起付标准以上自负比例金额'] != 0), '是否自负'] = 1
    df['是否民政救助'] = 0
    df.loc[(~df['民政救助补助金额'].isnull()) |
           (df['民政救助补助金额'] != 0), '是否民政救助'] = 4
    df['是否城乡优抚'] = 0
    df.loc[(~df['城乡优抚补助金额'].isnull()) |
           (df['城乡优抚补助金额'] != 0), '是否城乡优抚'] = 3
    # Count the separators appearing in the diagnosis text.
    df['case_has_type1'] = [len(value.split(',')) - 1 for value in case]
    df['case_has_type2'] = [len(value.split(',')) - 1 for value in case]
    df['case_has_type3'] = [len(value.split(';')) - 1 for value in case]
    df['case_has_type4'] = [len(value.split(';')) - 1 for value in case]
    df['case_has_type5'] = [len(value.split('(')) - 1 for value in case]
    df['case_has_type6'] = [len(value.split(' ')) - 1 for value in case]
    # Summary statistics.
    case_col1 = ['心脏病', '肺心病', '高血压', '冠心病', '挂号',
                 '门特挂号', '糖尿病', '尿毒症', '偏瘫', '精神病']
    case_col_sp = ['是否残疾军人', '是否城乡救助', '是否公务员',
                   '是否自负', '是否民政救助', '是否城乡优抚']
    df['病_sum'] = df[case_col1].sum(axis=1)
    df['special_sum'] = df[case_col_sp].sum(axis=1)
    case_col2 = ['case_length', 'case_has_type1', 'case_has_type2', 'case_has_type3',
                 'case_has_type4', 'case_has_type5', 'case_has_type6', '病_sum', 'special_sum']
    money = []
    no_money = []
    for value in df.columns.values:
        if '金额' in value:
            money.append(value)
        else:
            no_money.append(value)
    # Non-negative matrix factorization: factor the user x expense matrix and
    # use the user-factor matrix as new features (20 dimensions).
    money_sum_df = df.groupby(['uid'])[money].apply(np.sum).fillna(0)
    money_mean_df = df.groupby(['uid'])[money].apply(np.mean).fillna(0)
    matdfs = {'MoneySum': money_sum_df, 'MoneyMean': money_mean_df}
    matvs = {k: item.copy().values for k, item in matdfs.items()}
    factor_num = 20
    for k in matdfs:
        matdf, matv = matdfs[k], matvs[k]
        shape = matv.shape
        np.random.seed(20)
        initW = np.random.rand(shape[0], factor_num)
        initH = np.random.rand(factor_num, shape[1])
        outW, outH = nmf(matv, initW, initH, 0.0001, 5555555, 250)
        feature_factor = pd.DataFrame(
            outW,
            index=pd.Index(data=matdf.index.values, copy=True, name='uid'),
            columns=[k + '_factor_' + str(i) for i in range(factor_num)]).reset_index()
        print(feature_factor.max(axis=0))
        feature = pd.merge(feature, feature_factor, on='uid', how='left').fillna(0)
    # Number of records per user.
    Log('get record number feature...')
    feature_rec = df.groupby(['uid'])['顺序号'].count()
    feature_rec = feature_rec.rename('record_num')
    feature_rec = feature_rec.reset_index()
    feature = pd.merge(feature, feature_rec, on='uid', how='left')
    # Number of distinct hospitals visited per user.
    Log('get category feature...')
    feature_cat = df.groupby(['uid'])['医院编码'].nunique()
    feature_cat = feature_cat.rename('hospital_num')
    feature_cat = feature_cat.reset_index()
    feature = pd.merge(feature, feature_cat, on='uid', how='left')
    # Summary statistics for continuous variables [mean, sum, median, min, max, std].
    Log('get continuous feature...')
    col_con = money + case_col2
    # col_con.append('医疗救助医院申请')
    feature_con = df.groupby(['uid'])[col_con].agg(
        [np.mean, np.sum, np.median, np.min, np.max, np.std])
    feature_con = feature_con.reset_index()
    feature = pd.merge(feature, feature_con, on='uid', how='left')
    # Summary statistics for categorical variables [mean, sum, std].
    Log('get dummy feature...')
    feature_dum = df.groupby(['uid'])[case_col1 + case_col_sp].agg(
        [np.mean, np.sum, np.std])
    feature_dum = feature_dum.reset_index()
    feature = pd.merge(feature, feature_dum, on='uid', how='left')
    # Non-negative matrix factorization: factor the user x hospital matrix and
    # use the user-factor matrix as new features (15 dimensions).
    factor_num = 15
    PH_Mat = df.groupby(['uid', '医院编码']).size().unstack().fillna(0)
    PH_MatV = PH_Mat.copy().values
    shape = PH_MatV.shape
    np.random.seed(20)
    initW = np.random.rand(shape[0], factor_num)
    initH = np.random.rand(factor_num, shape[1])
    outW, outH = nmf(PH_MatV, initW, initH, 0.0001, 5555555, 200)
    feature_factor = pd.DataFrame(
        outW,
        index=pd.Index(data=PH_Mat.index.values, copy=True, name='uid'),
        columns=['Hostpital_factor_' + str(i) for i in range(factor_num)]).reset_index()
    feature = pd.merge(feature, feature_factor, on='uid', how='left').fillna(0)
    return feature
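# Hypothetical call, assuming `feature` already holds one row per uid and
# `df_train` is the record-level table loaded from df_train.csv:
# feature = getMainFeature(feature, df_train)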
def cluster():
    return nmf.nmf()
V = [
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
]
V = [
    [1, 1, 0, 1, 1],
    [1, 0, 0, 1, 0],
    [1, 1, 0, 1, 0],
    [1, 0, 0, 1, 0],
    [0, 1, 1, 1, 0],
]
V = numpy.array(V)
print("V = ")
print(V)
time_start = time.time()
K = 2
W, H = nmf.nmf(V, K)
time_end = time.time()
estimatedV = numpy.dot(W, H)
print("W = ")
print(W)
print("H = ")
print(H)
print("estimatedV = ")
print(estimatedV)
print(time_end - time_start)
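# The nmf.nmf(V, K) module used above is not shown. A minimal sketch of a
# compatible two-argument factorizer, assuming standard Lee-Seung
# multiplicative updates for the Frobenius objective ||V - WH||^2; the
# iteration count and epsilon are illustrative choices, not the original's.
import numpy

def nmf_sketch(V, K, n_iter=500, eps=1e-9):
    n, m = V.shape
    W = numpy.random.rand(n, K)
    H = numpy.random.rand(K, m)
    for _ in range(n_iter):
        # Multiplicative updates keep W and H nonnegative throughout.
        H *= (W.T @ V) / (W.T @ W @ H + eps)
        W *= (V @ H.T) / (W @ H @ H.T + eps)
    return W, H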
# Example data - impute to replace missing values
data = Orange.data.Table("yeast-class-RPR")
imputer = Orange.feature.imputation.MinimalConstructor()
imputer = imputer(data)
data = imputer(data)

# Convert to numpy and separate positive and negative values.
# Note that NMF works on strictly *nonnegative* data.
X, y, _ = data.to_numpy()
Xp = X * (X > 0)
Xn = X * (X < 0) * (-1)
X = np.hstack([Xp, Xn])

# Unsupervised learning of training examples
W, Hs = nmf([X], rank=rank)
Xa = W.dot(Hs[0])
print Xa.shape

plt.figure(figsize=(5.0, 5.0))
plt.imshow(X)
plt.title("Original")

plt.figure(figsize=(5.0, 5.0))
plt.title("Approximation")
plt.imshow(Xa)

# Show clusters
if show_features:
    plt.figure()
    plt.title("Clusters")
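# A small worked example of the nonnegativity trick above (illustrative
# values): a signed vector is split into its positive part and its negated
# negative part, so the stacked result is nonnegative and loses no information.
#   x  = [ 2, -3,  0]
#   xp = [ 2,  0,  0]      # x * (x > 0)
#   xn = [ 0,  3,  0]      # x * (x < 0) * (-1)
#   hstack([xp, xn]) = [2, 0, 0, 0, 3, 0]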
show_features = 0

# Example data - impute to replace missing values
data = Orange.data.Table("yeast-class-RPR")
imputer = Orange.feature.imputation.MinimalConstructor()
imputer = imputer(data)
data = imputer(data)

# Convert to numpy and separate positive and negative values.
# Note that NMF works on strictly *nonnegative* data.
X, y, _ = data.to_numpy()
Xp = X * (X > 0)
Xn = X * (X < 0) * (-1)
X = np.hstack([Xp, Xn])

# Reshape and select target class
y = y.reshape((len(X), 1))
y = (y == 0).astype(int)

# Learn the model parameters given the attributes and the class values:
# even-indexed examples train, odd-indexed examples test.
tr = range(0, len(X), 2)
te = range(1, len(X), 2)
W, Hs = nmf([X[tr], y[tr]], rank=rank)
Wte = nmf_fix([X[te]], [Hs[0]], rank=rank)
prediction = Wte.dot(Hs[1])

# Measure performance
auc = roc_auc_score(y_score=prediction, y_true=y[te])
print "AUC: ", auc
    # (Tail of a binary-writer helper; its def line is truncated above.)
    binvals = array.array('f', A.flatten('F').tolist())
    binvals.tofile(fp)
    fp.close()
    return

path = '../data/'
X = npy_from_file(path + 'X.bin')
W = npy_from_file(path + 'W.bin')
H = npy_from_file(path + 'H.bin')
#print X
#print W
#print H
nmf.nmf(X, W, H, 100, 1)
#print W
#print H
#print np.dot(W, H)
Ay_position_a_guess_llh = xyz2llh(Ay_position_a_guess[0, 0],
                                  Ay_position_a_guess[0, 1],
                                  Ay_position_a_guess[0, 2]).xyz()
F_zhd_a = Mobj_b_base_Error.saastamoinen_model(
    np.array([Ay_position_a_guess_llh[0]]),
    np.array([Ay_position_a_guess_llh[1]]),
    np.array([Ay_position_a_guess_llh[2]]),
    np.array([np.pi / 2]), 0)
F_zhd_b = Mobj_b_base_Error.saastamoinen_model(
    np.array([Ay_position_b_llh[0]]),
    np.array([Ay_position_b_llh[1]]),
    np.array([Ay_position_b_llh[2]]),
    np.array([np.pi / 2]), 0)
'''
Ay_nmf: nmf (mapping function) values at stations a and b, used to correct
Ay_geographical_distance_a and Ay_geographical_distance_b.
'''
Ay_nmf_b = nmf.nmf(I_doy, Ay_position_b_llh[0], Ay_position_b_llh[1],
                   Ay_position_b_llh[2], Ay_elevation)
Ay_nmf_a = nmf.nmf(I_doy, Ay_position_a_guess_llh[0], Ay_position_a_guess_llh[1],
                   Ay_position_a_guess_llh[2], Ay_elevation)
# Add the mapped zenith hydrostatic delay to the geometric ranges.
Ay_geographical_distance_b += F_zhd_b * Ay_nmf_b
Ay_geographical_distance_a += F_zhd_a * Ay_nmf_a
Ay_L1_chose_a_minus_ra = Ay_L1_chose_a - Ay_geographical_distance_a
Ay_C1_chose_a_minus_ra = Ay_C1_chose_a - Ay_geographical_distance_a
Ay_L1_chose_b_minus_rb = Ay_L1_chose_b - Ay_geographical_distance_b
Ay_C1_chose_b_minus_rb = Ay_C1_chose_b - Ay_geographical_distance_b
Ay_L1_chose_SD_minus_rSD = Ay_L1_chose_a_minus_ra - Ay_L1_chose_b_minus_rb
Ay_C1_chose_SD_minus_rSD = Ay_C1_chose_a_minus_ra - Ay_C1_chose_b_minus_rb
def recommend_skill_skill_matrix(user_skills, unique_skills):
    n = len(unique_skills.keys())
    ss_matrix = np.zeros(n * n).reshape(n, n)
    skill_id_map = {}
    skill_name_map = {}
    skill_id = 0
    for sk in unique_skills:
        skill_id_map[sk] = skill_id
        skill_name_map[skill_id] = sk
        skill_id += 1
    # Count skill co-occurrences across users (symmetric matrix).
    for sk in user_skills:
        for a in range(0, len(sk)):
            for b in range(a + 1, len(sk)):
                s1 = skill_id_map[sk[a]]
                s2 = skill_id_map[sk[b]]
                ss_matrix[s1][s2] += 1
                ss_matrix[s2][s1] += 1
    print 'Skill-skill matrix shape', ss_matrix.shape
    model = nmf('kl', 10)
    W, H = model.fit(ss_matrix, 200, False)
    ss_matrix_approx = W.dot(H)
    print "Reconstructed matrix ", ss_matrix_approx
    for sk in skill_id_map:
        sid = skill_id_map[sk]
        print sk.encode("utf-8"), " : "
        # Top 5 in the reconstructed row.
        val = ss_matrix_approx[sid, :]
        top5_h = []
        for i in range(5):
            max_v = 0
            max_j = 0
            for j in range(len(val)):
                if val[j] > max_v and (j not in top5_h):
                    max_v = val[j]
                    max_j = j
            top5_h.append(max_j)
        # Top 5 in the reconstructed column.
        val = ss_matrix_approx[:, sid].T
        top5_v = []
        for i in range(5):
            max_v = 0
            max_j = 0
            for j in range(len(val)):
                if val[j] > max_v and (j not in top5_v):
                    max_v = val[j]
                    max_j = j
            top5_v.append(max_j)
        # Top 5 in the original counts, for comparison.
        val = ss_matrix[sid, :]
        top5_o = []
        for i in range(5):
            max_v = 0
            max_j = 0
            for j in range(len(val)):
                if val[j] > max_v and (j not in top5_o):
                    max_v = val[j]
                    max_j = j
            top5_o.append(max_j)
        for t1, t2, t3 in zip(top5_o, top5_h, top5_v):
            print "  ", skill_name_map[t1].encode("utf-8"), " \t", \
                skill_name_map[t2].encode("utf-8"), " \t", \
                skill_name_map[t3].encode("utf-8")
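# The three hand-rolled top-5 scans above can each be replaced by a one-line
# numpy.argsort; a sketch of the equivalent (exclusion of already-picked
# indices is implicit, since argsort returns each index exactly once):
# top5 = np.argsort(val)[::-1][:5].tolist()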
from numpy import *
import numpy as np
import nimfa as nm
import nmf
import plot_err

m = 5
n = 5
W = mat(random.random((m, n)))
V = np.array([W, W, W])
w2 = mat(random.random((5, 2)))
h2 = mat(random.random((5, 2)))
(wo, ho) = nmf.nmf(V[:, :, 0], w2, h2, 0.001, 10, 10)
print((wo, ho))
    # (Tail of a binary-writer helper; its def line is truncated above.)
    binvals = array.array('f', A.flatten('F').tolist())
    binvals.tofile(fp)
    fp.close()
    return

path = '/home/ericb/projects/nmf/trunk/'
X = npy_from_file(path + 'X.bin')
W = npy_from_file(path + 'W.bin')
H = npy_from_file(path + 'H.bin')
#print X
#print W
#print H
nmf.nmf(X, W, H, 200, 1)
#print W
#print H
#print np.dot(W, H)