import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds


def predict_by_factorize(user_item):
    # svd = TruncatedSVD(n_components=3)
    # svd.fit(user_item)
    # print(svd.singular_values_)
    # Demean each user's row, factorize with truncated SVD, then add the mean back.
    user_item_mean = csr_matrix.mean(user_item, axis=1)
    user_item_normalized = user_item - user_item_mean
    U, sigma, V = svds(user_item_normalized, k=2)
    sigma = np.diag(sigma)
    pred = U.dot(sigma).dot(V) + user_item_mean.reshape(-1, 1)
    return pred
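# Hypothetical usage sketch (not from the original project): a tiny, made-up
# 4x5 user-item ratings matrix. Note svds(k=2) needs min(n_users, n_items) > 2.
import numpy as np
from scipy.sparse import csr_matrix

ratings = csr_matrix(np.array([
    [5., 3., 0., 1., 0.],
    [4., 0., 0., 1., 0.],
    [1., 1., 0., 5., 0.],
    [0., 0., 5., 4., 0.],
]))
pred = predict_by_factorize(ratings)
print(pred.shape)  # (4, 5): dense matrix of reconstructed ratings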
import numpy as np
from scipy.sparse import csr_matrix


def top_mean_feats(Xtr, features_names, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that on average are most important among the
        documents in the rows identified by the indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids]
    else:
        D = Xtr
    # Thresholding small tf-idf values caused a memory error; a sparse-friendly
    # way of doing this is still needed:
    # D[D < min_tfidf] = 0
    tfidf_means = np.asarray(csr_matrix.mean(D, axis=0)).reshape(-1)
    return top_tfidf_feats(tfidf_means, features_names, top_n)
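# Hypothetical usage sketch (not from the original project), assuming
# scikit-learn >= 1.0 for get_feature_names_out. `top_tfidf_feats` below is a
# minimal stand-in for the helper the original code calls, included only so
# the example runs end to end.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def top_tfidf_feats(row, features_names, top_n=25):
    # Stand-in: pair each feature with its mean tf-idf score and keep the top_n.
    top_idx = np.argsort(row)[::-1][:top_n]
    return [(features_names[i], row[i]) for i in top_idx]


docs = ["sparse matrices save memory",
        "tf idf weights rare terms",
        "mean tf idf scores per feature"]
vec = TfidfVectorizer()
Xtr = vec.fit_transform(docs)  # csr_matrix, one row per document
print(top_mean_feats(Xtr, vec.get_feature_names_out(), top_n=5))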
import json
import logging

import networkx as nx
import numpy as np
from scipy.sparse import csr_matrix


def SAC(G, l, c):
    # Convert the graph to a sparse adjacency matrix and record vertex information.
    # logging.info('graph loaded')
    # G = nx.read_gml('/home/fanfan/ML_RUI/model/word_graph1.gml')
    Gm = nx.adjacency_matrix(G)
    logging.info('adjacency matrix loaded')
    save_sparse_csr("graphMatrix", Gm)
    nodeNum = {}
    i = 0
    for n in G.nodes():
        nodeNum[i] = n
        i += 1
    with open("/home/fanfan/ML_RUI/model/nodesInfo.json", "a") as f:
        json.dump(nodeNum, f, ensure_ascii=False)
        f.write('\n')
    # Random-walk distance matrix: accumulate c*(1-c)^m * Gm^m for walk lengths m = 1..l.
    walkpro = c * (1 - c)
    rdm = Gm.copy()
    rdmsum = walkpro * rdm
    if l > 1:
        for mi in range(2, l + 1):
            rdm = rdm.dot(Gm)
            walkpro *= (1 - c)
            rdmsum += walkpro * rdm
            logging.info('walk %s finished' % (mi))
    mean = csr_matrix.mean(rdm)  # mean distance
    # array = (rdmsum.data - mean) ** 2
    # var = array.sum() / rdmsum.data.size  # distance variance
    rdmarray = rdmsum.toarray()
    IFdic = {}
    num = 0
    for row in rdmarray:
        sumf = 0.0
        for element in range(1, row.size):
            if num == element:
                continue
            fenl = 1 - np.e ** (pow(row[element], 2) / 2 / mean / -mean)
            sumf += fenl
        wordname = nodeNum[num]
        # Influence-function dictionary: word -> accumulated influence score.
        IFdic[wordname] = sumf
        num += 1
    # IFdic = sorted(IFdic.items(), key=lambda d: d[1], reverse=True)
    with open('/home/fanfan/ML_RUI/model/clusters/IFdic_part03.json', 'a') as outfile:
        json.dump(IFdic, outfile, ensure_ascii=False)
        outfile.write('\n')
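# The core of SAC, independent of the file I/O, is accumulating the walk
# probabilities c*(1-c)^m * Gm^m over walk lengths m = 1..l. A minimal
# self-contained sketch of just that step on a made-up toy graph (the graph
# and the values of l and c are illustrative only):
import networkx as nx
from scipy.sparse import csr_matrix

G = nx.cycle_graph(5)
Gm = nx.adjacency_matrix(G)  # sparse adjacency matrix
l, c = 3, 0.5
walkpro = c * (1 - c)
rdm = Gm.copy()
rdmsum = walkpro * rdm
for mi in range(2, l + 1):
    rdm = rdm.dot(Gm)        # walks of length mi
    walkpro *= (1 - c)
    rdmsum += walkpro * rdm
print(csr_matrix.mean(rdm))  # the mean used by the influence function above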
import numpy as np
from scipy.sparse import csr_matrix
from tqdm import tqdm


def correlation_filter(p, all_vars, quantile_filter=0.25):
    """Calculates correlations between phenotype and variants,
    returning those that are above the specified quantile

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        all_vars (scipy.sparse.csr_matrix)
            Narrow sparse matrix representation of all variants to fit to
            (rows = variants, columns = samples)
        quantile_filter (float)
            The quantile to discard at, e.g. 0.25 retains the top 75%
            [default = 0.25]

    Returns:
        cor_filter (numpy.array)
            The indices of variants passing the filter
    """
    # a = snp - mean(snp)
    # b = y - mean(y)
    # cor = abs(a %*% b / sqrt(sum(a^2) * sum(b^2)))
    b = p.values - np.mean(p.values)
    sum_b_squared = np.sum(np.power(b, 2))

    # NOTE: I couldn't get this to multithread efficiently using sparse matrices...
    # might work if the matrix was divided into chunks of rows first, but maybe not
    # worth it as it's pretty quick anyway
    correlations = []
    for row_idx in tqdm(range(all_vars.shape[0]), unit="variants"):
        k = all_vars.getrow(row_idx)
        k_mean = csr_matrix.mean(k)
        if k_mean == 0:
            # avoid crashes due to an empty sparse vector
            correlations.append([np.nan])
        else:
            ab = k.dot(b) - np.sum(k_mean * b)
            sum_a_squared = k.dot(k.transpose()).data[0] - \
                2 * k_mean * csr_matrix.sum(k) + \
                pow(k_mean, 2) * all_vars.shape[1]
            cor = np.abs(ab / np.sqrt(sum_a_squared * sum_b_squared))
            correlations.append(cor)

    cor_filter = np.nonzero(
        correlations > np.percentile(correlations, quantile_filter * 100))[0]
    return cor_filter
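# Hypothetical usage sketch (not from the original project): 4 samples and
# 6 variants, all values made up; a pandas Series stands in for the phenotype
# vector p.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

p = pd.Series([1.2, 0.4, 0.9, 0.1])
all_vars = csr_matrix(np.array([[1, 0, 1, 0],
                                [0, 1, 1, 0],
                                [1, 1, 0, 1],
                                [0, 0, 1, 1],
                                [1, 0, 0, 1],
                                [0, 1, 0, 1]], dtype=float))
keep = correlation_filter(p, all_vars, quantile_filter=0.25)
print(keep)  # indices of variants whose |correlation| is above the 25th percentile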
from collections import Counter

import numpy as np
from scipy.sparse import csr_matrix, issparse, vstack
from sklearn import cluster
from sklearn.metrics import pairwise


def clusterTweets(algor, documents, feaVecs, clusterArg):
    docDist = None
    if algor == "kmeans" or algor == "default":
        # kmeans: fast, slightly worse performance than agglomerative
        clusterModel = cluster.KMeans(n_clusters=clusterArg).fit(feaVecs)
        docDist = clusterModel.transform(feaVecs)
    elif algor == "affi":
        # affinity propagation: too slow
        clusterModel = cluster.AffinityPropagation().fit(feaVecs)
        clusterCenters = clusterModel.cluster_centers_
        docDist = pairwise.euclidean_distances(feaVecs, clusterCenters)  # , squared=True)
    elif algor == "spec":
        # spectral: too slow
        clusterModel = cluster.SpectralClustering(n_clusters=clusterArg).fit(feaVecs)
    elif algor == "agg":
        # AgglomerativeClustering
        clusterModel = cluster.AgglomerativeClustering(n_clusters=clusterArg).fit(feaVecs)
    elif algor == "dbscan":
        clusterModel = cluster.DBSCAN(eps=clusterArg,
                                      min_samples=5,
                                      metric='euclidean',
                                      algorithm='auto',
                                      n_jobs=8).fit(feaVecs)

    tLabels = clusterModel.labels_
    cLabels = sorted(Counter(tLabels).keys())
    if -1 in cLabels:
        # DBSCAN labels noise points as -1; drop them from the cluster list
        print("Cluster -1: ", list(tLabels).count(-1))
        cLabels.remove(-1)

    centroids = []
    for clbl in cLabels:
        dataIn = [item[0] for item in enumerate(tLabels) if item[1] == clbl]
        vecsIn = feaVecs[dataIn, :]
        if issparse(vecsIn):
            centroids.append(csr_matrix(csr_matrix.mean(vecsIn, axis=0)))
        else:
            centroids.append(np.mean(vecsIn, axis=0))

    if docDist is None:
        if issparse(feaVecs):
            centroids = vstack(centroids, format='csr')
        docDist = pairwise.euclidean_distances(feaVecs, centroids)
    return cLabels, tLabels, centroids, docDist
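# Hypothetical usage sketch (not from the original project): the "kmeans"
# branch with random, made-up dense features. `documents` is not used inside
# the function, so placeholder strings are passed.
import numpy as np

rng = np.random.RandomState(0)
feaVecs = rng.rand(50, 10)  # 50 tweets, 10-dimensional feature vectors
docs = ["doc%d" % i for i in range(50)]
cLabels, tLabels, centroids, docDist = clusterTweets("kmeans", docs, feaVecs, 5)
print(len(cLabels), docDist.shape)  # 5 clusters, (50, 5) document-to-centre distances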
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer


def calculate_features(twitter_users):
    for user in twitter_users:
        try:
            # Mean pairwise tf-idf similarity between the user's tweets
            tfidf = TfidfVectorizer(min_df=1).fit_transform(user.tweets)
            pairwise_similarity = tfidf * tfidf.T
            user.tfidf = csr_matrix.mean(pairwise_similarity).item()
        except Exception:
            user.tfidf = 0.0

        if user.numb_followings > 0:
            user.ratio_follower_following = user.numb_followers / user.numb_followings
        else:
            user.ratio_follower_following = 0

        at_count = 0
        http_count = 0
        for tweet in user.tweets:
            at_count += tweet.count("@")
            http_count += tweet.count("http")
        user.count_at = at_count
        user.count_http = http_count
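# Hypothetical usage sketch (not from the original project): TwitterUser is a
# minimal stand-in for the project's user objects; only the attribute names
# used by calculate_features are reproduced, everything else is made up.
class TwitterUser(object):
    def __init__(self, tweets, numb_followers, numb_followings):
        self.tweets = tweets
        self.numb_followers = numb_followers
        self.numb_followings = numb_followings


users = [TwitterUser(["hello @world http://a.example", "hello again @world"], 120, 40),
         TwitterUser(["just one tweet"], 5, 0)]
calculate_features(users)
print(users[0].tfidf, users[0].ratio_follower_following,
      users[0].count_at, users[0].count_http)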