Example #1
import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

def predict_by_factorize(user_item):
    # Center each user's ratings, factorize with a rank-2 truncated SVD and
    # reconstruct the matrix to obtain the predicted ratings.
    # svd = TruncatedSVD(n_components=3)
    # svd.fit(user_item)
    # print(svd.singular_values_)
    user_item_mean = csr_matrix.mean(user_item, axis=1)
    user_item_normalized = user_item - user_item_mean
    U, sigma, Vt = svds(user_item_normalized, k=2)
    sigma = np.diag(sigma)
    pred = U.dot(sigma).dot(Vt) + user_item_mean.reshape(-1, 1)
    return pred
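A minimal usage sketch (the ratings matrix below is made up for illustration and assumes the imports shown above):
ratings = csr_matrix(np.array([
    [5., 3., 0., 1.],
    [4., 0., 0., 1.],
    [1., 1., 0., 5.],
    [0., 0., 5., 4.],
]))
predictions = predict_by_factorize(ratings)
print(predictions.shape)  # (4, 4): dense matrix of reconstructed ratings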
Example #2
import numpy as np
from scipy.sparse import csr_matrix

def top_mean_feats(Xtr, features_names, grp_ids=None, min_tfidf=0.1, top_n=25):
    ''' Return the top n features that, on average, are most important among the
        documents in the rows identified by the indices in grp_ids. '''
    if grp_ids:
        D = Xtr[grp_ids]
    else:
        D = Xtr

    # memory error, need to find a way of doing that for sparse matrices
    #D[D < min_tfidf] = 0
    tfidf_means = np.asarray(csr_matrix.mean(D, axis=0)).reshape(-1)
    return top_tfidf_feats(tfidf_means, features_names, top_n)
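The snippet relies on a top_tfidf_feats helper that is not shown; a minimal sketch of what such a helper typically looks like (an assumption, not part of the original example):
import pandas as pd

def top_tfidf_feats(row, features_names, top_n=25):
    # Indices of the top_n largest mean tf-idf scores, highest first,
    # returned as a (feature, tfidf) DataFrame.
    topn_ids = np.argsort(row)[::-1][:top_n]
    return pd.DataFrame([(features_names[i], row[i]) for i in topn_ids],
                        columns=['feature', 'tfidf'])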
Example #3
import json
import logging

import networkx as nx
import numpy as np
from scipy.sparse import csr_matrix

def SAC(G, l, c):
    # Convert the graph to a sparse adjacency matrix and record the node information.
    # logging.info('graph loaded')
    #G = nx.read_gml('/home/fanfan/ML_RUI/model/word_graph1.gml')
    Gm = nx.adjacency_matrix(G)
    logging.info('adjacency matrix loaded')
    save_sparse_csr("graphMatrix", Gm)
    nodeNum = {}
    for i, n in enumerate(G.nodes()):
        nodeNum[i] = n
    with open("/home/fanfan/ML_RUI/model/nodesInfo.json", "a") as f:
        json.dump(nodeNum, f, ensure_ascii=False)
        f.write('\n')
    # Random-walk distance matrix: accumulate l steps of the walk, with each
    # additional step damped by a further factor of (1 - c).
    walkpro = c * (1 - c)
    rdm = Gm.copy()
    rdmsum = walkpro * rdm
    if l > 1:
        for mi in range(2, l + 1):
            rdm = rdm.dot(Gm)
            walkpro *= (1 - c)
            rdmsum += walkpro * rdm
            logging.info('walk %s finished' % (mi))
    mean = csr_matrix.mean(rdm)  # mean distance
    #array = (rdmsum.data-mean)**2
    #var = array.sum()/rdmsum.data.size  # distance variance
    rdmarray = rdmsum.toarray()
    IFdic = {}
    num = 0
    for row in rdmarray:
        sumf = 0.0
        for element in range(1, row.size):
            if num == element: continue
            # 1 minus a Gaussian kernel of the distance, scaled by the mean distance
            fenl = 1 - np.e**(pow(row[element], 2) / 2 / mean / -mean)
            sumf += fenl
        wordname = nodeNum[num]
        # influence function dictionary
        IFdic[wordname] = sumf
        num += 1
    #IFdic = sorted(IFdic.items(), key=lambda d: d[1],reverse=True)
    with open('/home/fanfan/ML_RUI/model/clusters/IFdic_part03.json',
              'a') as outfile:
        json.dump(IFdic, outfile, ensure_ascii=False)
        outfile.write('\n')
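save_sparse_csr is not defined in the snippet; a common way to implement it (assumed here, not taken from the original repository) is to persist the CSR components with numpy.savez:
def save_sparse_csr(filename, matrix):
    # Store the three CSR component arrays plus the shape in a single .npz file.
    np.savez(filename, data=matrix.data, indices=matrix.indices,
             indptr=matrix.indptr, shape=matrix.shape)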
Example #4
import numpy as np
from scipy.sparse import csr_matrix
from tqdm import tqdm

def correlation_filter(p, all_vars, quantile_filter=0.25):
    """Calculates correlations between phenotype and variants,
    giving those that are above the specified quantile

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        all_vars (scipy.sparse.csr_matrix)
            Narrow sparse matrix representation of all variants to fit to
            (rows = variants, columns = samples)
        quantile_filter (float)
            The quantile below which variants are discarded; e.g. 0.25
            retains the top 75%

            [default = 0.25]

    Returns:
        cor_filter (numpy.array)
            The indices of variants passing the filter
    """
    # a = snp - mean(snp)
    # b = y - mean(y)
    # cor = abs(a%*%b / sqrt(sum(a^2)*sum(b^2)) )
    b = p.values - np.mean(p.values)
    sum_b_squared = np.sum(np.power(b, 2))

    # NOTE: I couldn't get this to multithread efficiently using sparse matrices...
    # might work if the matrix was divided into chunks of rows first, but maybe not
    # worth it as it's pretty quick anyway
    correlations = []
    for row_idx in tqdm(range(all_vars.shape[0]), unit="variants"):
        k = all_vars.getrow(row_idx)
        k_mean = csr_matrix.mean(k)
        if k_mean == 0:
            # avoid crashes due to an empty sparse vector
            correlations.append([np.nan])
        else:
            ab = k.dot(b) - np.sum(k_mean * b)
            sum_a_squared = k.dot(
                k.transpose()).data[0] - 2 * k_mean * csr_matrix.sum(k) + pow(
                    k_mean, 2) * all_vars.shape[1]
            cor = np.abs(ab / np.sqrt(sum_a_squared * sum_b_squared))
            correlations.append(cor)

    cor_filter = np.nonzero(
        correlations > np.percentile(correlations, quantile_filter * 100))[0]
    return cor_filter
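A hypothetical call (the phenotype and genotype data below are made up; pandas is imported here in addition to the imports above):
import pandas as pd

rng = np.random.default_rng(0)
genotypes = rng.integers(0, 2, size=(100, 30)).astype(float)  # variants x samples
genotypes[:, 0] = 1.0                      # ensure no variant row is all zeros
all_vars = csr_matrix(genotypes)
p = pd.DataFrame(rng.random((30, 1)))      # phenotype vector, one value per sample
keep = correlation_filter(p, all_vars, quantile_filter=0.25)
print(keep)  # indices of variants above the 25th percentile of |correlation|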
Example #5
import numpy as np
from collections import Counter
from scipy.sparse import issparse, csr_matrix, vstack
from sklearn import cluster
from sklearn.metrics import pairwise

def clusterTweets(algor, documents, feaVecs, clusterArg):
    docDist = None
    if algor == "kmeans" or algor == "default":
        # kmeans: fast, a little bit worse performance than agglomerative
        clusterModel = cluster.KMeans(n_clusters=clusterArg).fit(feaVecs)
        docDist = clusterModel.transform(feaVecs)
    elif algor == "affi":
        # affinity: too slow
        clusterModel = cluster.AffinityPropagation().fit(feaVecs)
        clusterCenters = clusterModel.cluster_centers_
        docDist = pairwise.euclidean_distances(
            feaVecs, clusterCenters)  #, squared=True)
    elif algor == "spec":
        # spectral: too slow
        clusterModel = cluster.SpectralClustering(
            n_clusters=clusterArg).fit(feaVecs)
    elif algor == "agg":
        #AgglomerativeClustering
        clusterModel = cluster.AgglomerativeClustering(
            n_clusters=clusterArg).fit(feaVecs)
    elif algor == "dbscan":
        clusterModel = cluster.DBSCAN(eps=clusterArg,
                                      min_samples=5,
                                      metric='euclidean',
                                      algorithm='auto',
                                      n_jobs=8).fit(feaVecs)

    tLabels = clusterModel.labels_
    cLabels = sorted(Counter(tLabels).keys())
    if -1 in cLabels:
        # DBSCAN labels noise points as -1; report and drop that pseudo-cluster.
        print("Cluster -1: ", list(tLabels).count(-1))
        cLabels.remove(-1)

    centroids = []
    for clbl in cLabels:
        dataIn = [item[0] for item in enumerate(tLabels) if item[1] == clbl]
        vecsIn = feaVecs[dataIn, :]
        if issparse(vecsIn):
            centroids.append(csr_matrix(csr_matrix.mean(vecsIn, axis=0)))
        else:
            centroids.append(np.mean(vecsIn, axis=0))
    if docDist is None:
        if centroids and issparse(centroids[0]):
            centroids = vstack(centroids, format='csr')
        if not issparse(feaVecs):
            docDist = pairwise.euclidean_distances(feaVecs, centroids)
    return cLabels, tLabels, centroids, docDist
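A hypothetical call with dense features (the data below is made up and assumes the imports shown above):
docs = ["tweet %d" % i for i in range(100)]
feaVecs = np.random.rand(100, 20)
cLabels, tLabels, centroids, docDist = clusterTweets("kmeans", docs, feaVecs, 5)
print(len(cLabels), docDist.shape)  # 5 clusters, (100, 5) document-to-centroid distances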
Example #6
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_features(twitter_users):
    for user in twitter_users:

        try:
            # Mean pairwise similarity between the user's tweets
            # (dot products of the L2-normalised tf-idf vectors).
            tfidf = TfidfVectorizer(min_df=1).fit_transform(user.tweets)
            pairwise_similarity = tfidf * tfidf.T
            user.tfidf = csr_matrix.mean(pairwise_similarity).item()
        except Exception:
            user.tfidf = 0.0

        if user.numb_followings > 0:
            user.ratio_follower_following = user.numb_followers / user.numb_followings
        else:
            user.ratio_follower_following = 0
        at_count = 0
        http_count = 0
        for tweet in user.tweets:
            at_count += tweet.count("@")
            http_count += tweet.count("http")
        user.count_at = at_count
        user.count_http = http_count
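A minimal sketch of calling it with a stand-in user object (the real user class is not shown, so SimpleNamespace and the attribute values below are assumptions for illustration):
from types import SimpleNamespace

user = SimpleNamespace(
    tweets=["check this out http://example.com", "hello @friend", "hello again"],
    numb_followers=120,
    numb_followings=60,
)
calculate_features([user])
print(user.tfidf, user.ratio_follower_following, user.count_at, user.count_http)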