Example No. 1
    def Train(self, X, Y):
        #TODO: Estimate Naive Bayes model parameters
        positive_indices = np.argwhere(Y == 1.0).flatten()
        negative_indices = np.argwhere(Y == -1.0).flatten()
        #Number of positive and negative reviews
        self.num_positive_reviews = len(positive_indices)
        self.num_negative_reviews = len(negative_indices)
        #Count of positive and negative words
        self.count_positive = csr_matrix.sum(X[np.ix_(positive_indices)],
                                             axis=0)
        self.count_negative = csr_matrix.sum(X[np.ix_(negative_indices)],
                                             axis=0)
        #Total positive and negative word counts
        self.total_positive_words = csr_matrix.sum(X[np.ix_(positive_indices)])
        self.total_negative_words = csr_matrix.sum(X[np.ix_(negative_indices)])
        #Denominator
        self.deno_pos = float(self.total_positive_words +
                              self.ALPHA * X.shape[1])
        self.deno_neg = float(self.total_negative_words +
                              self.ALPHA * X.shape[1])

        self.count_positive = (self.count_positive + self.ALPHA)
        self.count_negative = (self.count_negative + self.ALPHA)

        return
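
As context for the calls above, a minimal self-contained sketch (toy data, not taken from the snippet) of what csr_matrix.sum returns for each axis choice:

# Toy illustration of the csr_matrix.sum variants used in Train above.
import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[1, 0, 2],
                         [0, 3, 0]]))
print(csr_matrix.sum(X, axis=0))  # per-word column totals:    [[1 3 2]]
print(csr_matrix.sum(X, axis=1))  # per-document row totals:   [[3] [3]]
print(csr_matrix.sum(X))          # grand total:               6
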
    def Train(self, X, Y):
        #TODO: Estimate Naive Bayes model parameters
        positive_indices = np.argwhere(Y == 1.0).flatten()
        negative_indices = np.argwhere(Y == -1.0).flatten()
        #num of positive reviews/ pFiles
        self.num_positive_reviews = len(positive_indices)
        #num of negative reviews/ nFiles
        self.num_negative_reviews = len(negative_indices)
        #array of positive counts for each word
        self.count_positive = csr_matrix.sum(X[np.ix_(positive_indices)],
                                             axis=0) + self.ALPHA
        #array of negative counts for each word
        self.count_negative = csr_matrix.sum(X[np.ix_(negative_indices)],
                                             axis=0) + self.ALPHA

        #total count for all positive words
        self.total_positive_words = np.sum(self.count_positive)
        #total count for all negative words
        self.total_negative_words = np.sum(self.count_negative)

        #Denominator of P(w|c): positive word count + smoothing over the vocabulary
        self.deno_pos = self.total_positive_words + self.ALPHA * X.shape[1]
        #Denominator of P(w|c): negative word count + smoothing over the vocabulary
        self.deno_neg = self.total_negative_words + self.ALPHA * X.shape[1]

        # self.count_positive = 1
        # self.count_negative = 1
        self.pos_recall = []
        self.pos_precision = []
        self.neg_recall = []
        self.neg_precision = []

        return
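
For completeness, a hedged sketch (not the author's prediction method; doc and the helper name are hypothetical) of how the smoothed counts and denominators computed in Train are typically turned into a per-class score. Here doc is assumed to be a 1 x |V| sparse count vector for a single review:

import numpy as np

def naive_bayes_log_score(doc, count_c, deno_c, num_c_reviews, num_total_reviews):
    # log P(c) + sum over words of count(w, doc) * log P(w | c)
    log_prior = np.log(num_c_reviews / num_total_reviews)
    log_likelihood = doc.multiply(np.log(count_c / deno_c)).sum()
    return log_prior + log_likelihood
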
Example No. 3
    def Train(self, X, Y):
        #TODO: Estimate Naive Bayes model parameters
        positive_indices = np.argwhere(Y == 1.0).flatten()
        negative_indices = np.argwhere(Y == -1.0).flatten()

        self.num_positive_reviews = len(positive_indices)
        self.num_negative_reviews = len(negative_indices)

        self.count_P = np.ix_(positive_indices)
        self.count_N = np.ix_(negative_indices)

        self.count_positive = self.count_positive + csr_matrix.sum(
            X[self.count_P], axis=0) + self.ALPHA
        self.count_negative = self.count_negative + csr_matrix.sum(
            X[self.count_N], axis=0) + self.ALPHA

        self.total_positive_words = csr_matrix.sum(X[self.count_P])
        self.total_negative_words = csr_matrix.sum(X[self.count_N])

        self.deno_pos = float(self.total_positive_words +
                              self.ALPHA * X.shape[1])
        self.deno_neg = float(self.total_negative_words +
                              self.ALPHA * X.shape[1])

        return
Example No. 4
    def build_grad(self, v):

        try:

            logger.info("build_grad --------------> ")
            # Empirical counts, convert matrix to vector
            empirical_counts = np.squeeze(
                np.asarray(csr_matrix.sum(self.training_matrix, axis=0)))

            # Expected counts
            expected_counts = np.zeros(self.feature_vector_len)
            for (i, j) in self.all_ones_positions_in_the_feature_vector:
                mat = self.all_ones_positions_in_the_feature_vector[(i, j)]
                # Calculating all probabilities at once per each (i,j) --> x(i)
                nominators = np.exp(mat.dot(v))
                denominator = np.sum(nominators)
                prob = nominators / denominator
                expected_counts += mat.transpose().dot(prob)

            logger.info("build_grad <-------------- ")

            return -(empirical_counts - expected_counts - self.lambda_reg * v)

        except Exception as e:
            print(e)
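
A common sanity check for a gradient like build_grad is a central-difference comparison; below is a generic sketch where loss and grad are hypothetical callables standing in for the objective and for build_grad (neither is defined in the snippet above):

import numpy as np

def finite_difference_check(loss, grad, v, eps=1e-6):
    # Compare the analytic gradient against central differences at v.
    g_analytic = grad(v)
    g_numeric = np.zeros_like(v)
    for k in range(len(v)):
        e = np.zeros_like(v)
        e[k] = eps
        g_numeric[k] = (loss(v + e) - loss(v - e)) / (2 * eps)
    return np.max(np.abs(g_analytic - g_numeric))
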
    def __init__(self, weight_graph, labels):

        # 2D Array of weights
        if (np.amin(weight_graph) < 0):
            raise ValueError('Negative weights inputted')
        else:
            self.weight_matrix = csr_matrix(weight_graph)

        # Input labels. Unlabeled points should be -1, with classes ranging from 0 upward
        if (len(labels.shape) == 1):
            self.class_labels = labels
            self.number_of_classes = np.amax(labels) + 1

        if (len(labels.shape) > 1):
            self.class_labels = labels
            self.number_of_classes = labels.shape[1]

        self.node_number = self.weight_matrix.shape[0]
        self.y_matrix_creation()

        # Stores degrees of nodes in array
        self.degrees = csr_matrix.sum(self.weight_matrix, axis=1)
        self.degrees = np.asarray(self.degrees).reshape(-1)
        for i in range(self.node_number):
            if (self.degrees[i] == 0):
                self.degrees[i] += np.nextafter(np.float32(0), np.float32(1))
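
The np.nextafter call above replaces a zero degree with the smallest representable positive float32, so later divisions by the degree cannot blow up; a one-line check:

import numpy as np
print(np.nextafter(np.float32(0), np.float32(1)))  # ~1.4e-45, smallest positive float32
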
Example No. 6
def training(num_words, X_train, y_train, alpha):
    '''
    This function trains the naive bayes algorithm by calculating the conditional
    and class probabilities'''

    # initialize probability matrix
    prob_mat = np.zeros([X_train.shape[1], 20])
    # calculate the relative frequencies
    freq_bins = np.asarray(np.bincount(y_train), dtype=float)
    prob_freq = np.log(freq_bins / np.sum(freq_bins))

    # for each class in the dataset
    for k in range(0, 20):
        # find the cases corresponding to that class
        y_train_k_ind = np.where(y_train == k)[0]

        # extract from the training matrix
        X_train_k = X_train[y_train_k_ind]

        # calculate the numerator (note that this is vectorized)
        numerator_k = np.array(csr_matrix.sum(X_train_k, axis=0) + alpha,
                               dtype=float)

        # the associated conditional probabilities
        log_probs_k = np.log(
            np.transpose(np.array(numerator_k / np.sum(numerator_k))))

        # insert in the storage matrix
        prob_mat[:, [k]] = log_probs_k

    return prob_freq, prob_mat
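
A hedged sketch of how the returned prob_freq and prob_mat would typically be used to classify new documents (the predict helper and X_test are hypothetical, not part of the snippet above):

import numpy as np

def predict(X_test, prob_freq, prob_mat):
    # log P(c | x) is proportional to log P(c) + sum_w count(w, x) * log P(w | c)
    scores = X_test @ prob_mat + prob_freq   # shape (n_docs, 20)
    return np.asarray(np.argmax(scores, axis=1)).ravel()
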
def eq1(Wcm, Wuu, Wl, H, T, ak, wf):
    # sku = 0.05
    # suu = 0.01
    # sl = 1
    # lamd = 100

    sku = 0.05
    suu = 0.01
    sl = 1
    lamd = 100

    X = csr.sum(Wcm, axis=1)  #numpy matrix
    Dcm = diags(X.A.ravel()).tocsr()

    X = csr.sum(Wuu, axis=1)  #numpy matrix
    Duu = diags(X.A.ravel()).tocsr()

    X = csr.sum(Wl, axis=1)  #numpy matrix
    Dl = diags(X.A.ravel()).tocsr()

    Lifm = csr.transpose(Dcm -
                         Wcm).dot(Dcm -
                                  Wcm) + suu * (Duu - Wuu) + sl * (Dl - Wl)
    # Lifm = suu*(Duu-Wuu) + sl*(Dl-Wl)

    A = Lifm + lamd * T + sku * H
    b = (lamd * T + sku * H).dot(wf)
    # print(csr.sum(b))
    M = diags(A.diagonal())
    # print(A.shape)
    # print(b.shape)
    alpha = cg(A,
               b,
               x0=wf,
               tol=1e-05,
               maxiter=100,
               M=None,
               callback=None,
               atol=None)
    # alpha = spsolve(A, b)
    # print(alpha)
    # print(type(alpha[0]))
    return alpha[0] * 255
    def apply_redundancy_penalty(self, selected_sentence, sentences):
        """
        Apply a redundancy penalty to all sentences based on the given selected sentence
        :param selected_sentence: the selected sentence
        :return: void
        """
        selected_vector = selected_sentence.vector

        for sentence in sentences:
            overlap = csr_matrix.sum((selected_vector != 0).multiply(sentence.vector != 0))
            counts = selected_vector.sum() + sentence.vector.sum()
            sentence.mead_score = sentence.mead_score - (overlap/counts)
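
A toy illustration (hypothetical vectors) of the overlap term above: (a != 0).multiply(b != 0) is nonzero exactly where both sentence vectors share a term.

import numpy as np
from scipy.sparse import csr_matrix

a = csr_matrix(np.array([[2, 0, 1, 0]]))
b = csr_matrix(np.array([[1, 3, 1, 0]]))
overlap = csr_matrix.sum((a != 0).multiply(b != 0))  # 2 shared terms
counts = a.sum() + b.sum()                           # 8 total counts
print(overlap / counts)                              # 0.25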
def generarte_autocomplete_vocab():

    with open("vocab_to_ix.json", 'r') as f:
        data = json.load(f)
    vocab = [str(key) for key, val in data.iteritems()]
    f.close()

    with open('tfidf_mat.npz', 'r') as f1:
        tfidf_mat = load_npz(f1)
        co_occurence_mat = (tfidf_mat.T) * tfidf_mat
    f1.close()

    with open('tfidf_mat.npz', 'r') as f1:
        with open("ix_to_vocab.json", 'r') as f2:
            tfidf_mat = load_npz(f1)
            ix_to_vocab = json.load(f2)
            sum_arr = csr_matrix.sum(tfidf_mat, axis=0)
            x = np.argsort(sum_arr)
            words_arr = []
            for ix in range(116754):
                val = x[0, ix]
                word = ix_to_vocab[str(val)]
                words_arr.append(word.encode("utf8"))
    f1.close()
    f2.close()

    low_bound = len(words_arr) - 1000
    word_refined = words_arr[low_bound:]
    no_ints = [word for word in word_refined if not word.isdigit()]

    with open("vocab_to_ix.json", 'r') as f:
        with open("ix_to_vocab.json", 'r') as f2:
            ix_to_vocab = json.load(f2)
            vocab_to_ix = json.load(f)
            bigrams = []
            for word in no_ints:
                ix = vocab_to_ix[word]
                sorted_row = np.argsort(
                    co_occurence_mat[ix, :].toarray()[0])[::-1]
                #print(sorted_row)
                #First index is the same word, so take the second and third word
                ix1, ix2 = sorted_row[1], sorted_row[2]
                bigram_1 = word.encode("utf8") + " " + ix_to_vocab[str(
                    ix1)].encode("utf8")
                bigram_2 = word.encode("utf8") + " " + ix_to_vocab[str(
                    ix2)].encode("utf8")
                bigrams.append(bigram_1)
                bigrams.append(bigram_2)
    f.close()
    f2.close()

    with open("autocomplete_bigram_vocab.pickle", "wb") as outfile:
        pickle.dump(bigrams, outfile)
    def Train(self, X, Y):
        pos_indices = np.argwhere(Y == 1.0).flatten()
        neg_indices = np.argwhere(Y == -1.0).flatten()

        self.pos_rev = len(pos_indices)
        self.neg_rev = len(neg_indices)

        self.count_pos = csr_matrix.sum(X[np.ix_(pos_indices)],
                                        axis=0) + self.ALPHA
        self.count_neg = csr_matrix.sum(X[np.ix_(neg_indices)],
                                        axis=0) + self.ALPHA

        self.total_pos = csr_matrix.sum(X[np.ix_(pos_indices)])
        self.total_neg = csr_matrix.sum(X[np.ix_(neg_indices)])

        self.deno_pos = float(self.total_pos + self.ALPHA * X.shape[1])
        self.deno_neg = float(self.total_neg + self.ALPHA * X.shape[1])

        samples = self.samples
        valid = 0
        weight_trans = np.zeros([X.shape[1], 1])
        converged = 1
        for j in range(samples):
            term = (X[j].dot(weight_trans))
            valid = 0
            if (term > 0.0):
                valid = 1.0
            elif term < 0.0:
                valid = -1.0
            if Y[j] != valid:
                weight_trans += (Y[j] * X[j].transpose())
                self.for_avg_weight += Y[j]
                converged = 0
            if converged == 1:
                break
        self.weight = weight_trans.transpose()

        return
Example No. 11
	def matrix_get_topicSpecificRank(self, teleport_set, initial_rank_vector, 
		google_matrix):
		"""Calculates TopicSpecificRank of each node taking some related_pages 
		as `teleport_set`.

		This method works by applying power iteration until convergence
		or till iterations reach `MAX_ITERATIONS`, whichever happens first.
		
		[USAGE WARNING] : If the graph is large, the sparse matrix may become
			huge and use up the entire RAM (which is not a condition you want to be in).
		...

		Parameters
		----------
		teleport_set : list of int
			List of pages to which a random walker in the web-graph can 
			teleport to.
			In TopicSpecificRank this set corresponds to pages of same topic.

		initial_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), 
			n is `node_num`]
			Ranks are distributed equally among all pages, initially.

		google_matrix : scipy.sparse.csr_matrix [shape = (n x n), n is 
						`node_num`]
			It contains proportion of rank that will propagate from a 
			page to another page.


		Returns
		-------
		final_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), 
			n is `node_num`]
			Contains TopicSpecificRank of each node in the web-graph.

		"""
		iterations = 0
		diff = math.inf
		teleport_set_size = len(teleport_set)
		final_rank_vector = SparseMatrix(np.zeros(self.node_num).transpose())

		while(iterations < self.MAX_ITERATIONS and diff > self.epsilon):
			new_rank_vector = google_matrix * initial_rank_vector

			leaked_rank = ((1 - SparseMatrix.sum(new_rank_vector)) /
				teleport_set_size)
			leaked_rank_vector = SparseMatrix(np.array(
				[leaked_rank if node in teleport_set else 0
				 for node in range(self.node_num)])).transpose()
def squaredis(P,Cent):    
    d=Cent.shape[1]
    C=SM((Cent.shape[0],d+2))    
    C[:,1]=1      #C is defined just as in the algorithm you sent me.
    C[:,0] =SM.sum(SM.power(Cent, 2), 1)
    C[:,2:d+2]=Cent
    D=SM.dot(P,C.T)
    D=D.toarray()
    Tags=D.argmin(1)#finding the most close centroid for each point 
    if min(D.shape)>1:
        dists=D.min(1)
    else:
        dists=np.ravel(D)
    y=D.argmin(0)
    return dists,Tags,y 
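
The column layout above exploits the expansion of the squared Euclidean distance, ||p - c||^2 = ||p||^2 - 2 p.c + ||c||^2, so all point-to-centroid distances come out of one matrix product. A small dense check of that identity (toy data, independent of the exact SM column layout):

import numpy as np

P = np.random.rand(5, 3)     # 5 points
Cent = np.random.rand(2, 3)  # 2 centroids
D = (np.sum(P**2, axis=1)[:, None]
     - 2 * P @ Cent.T
     + np.sum(Cent**2, axis=1)[None, :])
ref = ((P[:, None, :] - Cent[None, :, :])**2).sum(-1)
print(np.allclose(D, ref))   # True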
Example No. 13
def _graph_node_vectorize(graph,
                          decomposition_funcs,
                          preprocessors=None,
                          nbits=16,
                          effective_radius=1,
                          cutoff_effective_radius_factor=2,
                          max_num_node_features=1,
                          weight=None,
                          type_of='shortest',
                          attribute_label=None):
    data_matrix = node_proximity_node_vectorize(
        graph, decomposition_funcs, preprocessors, nbits, effective_radius,
        cutoff_effective_radius_factor, max_num_node_features, weight, type_of,
        attribute_label)
    vec = csr_matrix(csr_matrix.sum(data_matrix, axis=0))
    return vec
Example No. 14
def correlation_filter(p, all_vars, quantile_filter=0.25):
    """Calculates correlations between phenotype and variants,
    giving those that are above the specified quantile

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        all_vars (scipy.sparse.csr_matrix)
            Narrow sparse matrix representation of all variants to fit to
            (rows = variants, columns = samples)
        quantile_filter (float)
            The quantile to discard at e.g. 0.25, retain top 75%

            [default = 0.25]

    Returns:
        cor_filter (numpy.array)
            The indices of variants passing the filter
    """
    # a = snp - mean(snp)
    # b = y - mean(y)
    # cor = abs(a%*%b / sqrt(sum(a^2)*sum(b^2)) )
    b = p.values - np.mean(p.values)
    sum_b_squared = np.sum(np.power(b, 2))

    # NOTE: I couldn't get this to multithread efficiently using sparse matrices...
    # might work if the matrix was divided into chunks of rows first, but maybe not
    # worth it as it's pretty quick anyway
    correlations = []
    for row_idx in tqdm(range(all_vars.shape[0]), unit="variants"):
        k = all_vars.getrow(row_idx)
        k_mean = csr_matrix.mean(k)
        if k_mean == 0:
            # avoid crashes due to an empty sparse vector
            correlations.append([np.nan])
        else:
            ab = k.dot(b) - np.sum(k_mean * b)
            sum_a_squared = k.dot(
                k.transpose()).data[0] - 2 * k_mean * csr_matrix.sum(k) + pow(
                    k_mean, 2) * all_vars.shape[1]
            cor = np.abs(ab / np.sqrt(sum_a_squared * sum_b_squared))
            correlations.append(cor)

    cor_filter = np.nonzero(
        correlations > np.percentile(correlations, quantile_filter * 100))[0]
    return (cor_filter)
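
A toy sanity check (hypothetical dense data) that the sparse expression above reproduces the absolute Pearson correlation computed by np.corrcoef:

import numpy as np
from scipy.sparse import csr_matrix

snp = np.array([0., 1., 2., 0., 1., 0., 2., 1.])
y = np.array([0.2, 0.9, 1.7, 0.1, 1.1, 0.3, 1.9, 0.8])

b = y - y.mean()
k = csr_matrix(snp)
k_mean = csr_matrix.mean(k)
ab = k.dot(b) - np.sum(k_mean * b)
sum_a_squared = (k.dot(k.transpose()).data[0]
                 - 2 * k_mean * csr_matrix.sum(k)
                 + k_mean ** 2 * k.shape[1])
cor = np.abs(ab / np.sqrt(sum_a_squared * np.sum(b ** 2)))
print(np.isclose(cor[0], abs(np.corrcoef(snp, y)[0, 1])))  # True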
Example No. 15
    def train(self, X, y, word_vocab):
        """
        Train on the sparse document-term matrix X and associated labels y.
        In the test case below, p_wc is a class-term-matrix and has a row
        for each class and a column for each term. So the value at ij is
        the p_wc for the j-th term in the i-th class.
        p_c is an array of global probabilities for each class.
        >>> wv, cv = generate_vocab("example.txt")
        >>> X, y = read_labeled_data("example.txt", cv, wv)
        >>> nb = NaiveBayes(cv, wv)
        >>> nb.train(X, y, wv)
        >>> numpy.round(nb.p_wc, 3)
        array([[ 0.664,  0.336],
               [ 0.335,  0.665]])
        >>> numpy.round(nb.p_c, 3)
        array([ 0.5,  0.5])
        """

        total = numpy.unique(y)
        print(total)
        self.classes = total
        matrices = []
        for i, label in enumerate(total):
            self.p_c.append((y == label).sum() / len(y))
            matrices.append(X[y == label])

        for i, matrix in enumerate(matrices):
            column_summed_matrix = csr_matrix.sum(matrix, axis=0)
            n_vocab = len(word_vocab.keys())
            nC = column_summed_matrix.sum() + self.e * n_vocab
            print(nC)
            column_summed_matrix += self.e

            if i == 0:
                p_row = numpy.divide(column_summed_matrix, nC)

            else:
                p_row_onwards = numpy.divide(column_summed_matrix, nC)
                p_row = numpy.concatenate((p_row, p_row_onwards))
        print(p_row)
        self.p_wc = p_row
        self.log_p_wc = numpy.log(self.p_wc)
        print(self.log_p_wc)
    def fit(self, X, y):
        check_X_y(X, y, accept_sparse=['csc', 'csr', 'coo', 'dok',
                        'bsr', 'lil', 'dia'])
        check_array(X, accept_sparse=['csc', 'csr', 'coo', 'dok',
                        'bsr', 'lil', 'dia'])
        self.X_ = X
        check_classification_targets(y)
        classes = np.nonzero(y)

        n_samples, n_classes = len(y), len(classes)
        # create diagonal matrix of degree of nodes
        if sparse.isspmatrix(self.X_):
            B_ = self.X_.copy().astype(np.float)
            D = np.array(csr_matrix.sum(self.X_, axis=1), dtype=np.float).T[0]
        else:
            B_ = np.copy(self.X_).astype(np.float)
            D = np.array(np.sum(self.X_, axis=1), dtype=np.float)

        # if -self.sigma and (self.sigma - 1) are not equal, we need different diagonal matrices on the left and right sides
        if (- self.sigma) == (self.sigma - 1):
            D_left = D_right = np.power(D, - self.sigma)
        else:
            D_left = np.power(D, - self.sigma)
            D_right = np.power(D, self.sigma - 1)

        # M_ = D_left.dot(B_)
        for i, d in enumerate(D_left):
            B_[i, :] *= d
        # B_ = M_.dot(D_right)
        for i, d in enumerate(D_right):
            B_[:, i] *= d
        # create labeled data Z
        dimension = (n_samples, n_classes)
        labels = np.nonzero(y)
        ans_y = np.zeros(dimension)
        for l in labels[0]:
            ans_y[l][y[l] - 1] = 1

        Z_ = (self.sigma / (1 + self.sigma)) * ans_y
        self.initial_vector_ = np.ones(dimension) / n_classes
        self._get_method_(B_, Z_)
        return self
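
A small dense sketch (toy matrix) of the in-place loops above: scaling the rows by D_left and the columns by D_right is the same as diag(D_left) @ B @ diag(D_right).

import numpy as np

B = np.arange(9, dtype=float).reshape(3, 3)
D_left = np.array([1., 2., 3.])
D_right = np.array([0.5, 1., 2.])
scaled = B * D_left[:, None] * D_right[None, :]
print(np.allclose(scaled, np.diag(D_left) @ B @ np.diag(D_right)))  # True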
Example No. 17
def spectral_cluster(G, node_list):
    # G is a similarity matrix
    S = nx.to_scipy_sparse_matrix(G, nodelist=node_list)

    previous_sum_cut = 0
    previous_cluster_node = {}
    previous_cluster_label = {}
    for i in range(2, 100):
        labels = spectral_clustering(S, n_clusters=i)
        labels = labels.tolist()
        # print(labels)
        result_cluster_node = dict(zip(node_list, labels))
        result_cluster_label = {}
        for k in result_cluster_node:
            v = result_cluster_node[k]
            if v in result_cluster_label:
                result_cluster_label.get(v).add(k)
            else:
                result_cluster_label[v] = {k}
        # print(result_cluster_label)
        sum_cut = 0
        for k in result_cluster_label:
            cut_k = 0
            vol_k = 0
            v = result_cluster_label[k]
            for nk in v:
                set_not_k = set(node_list).difference(v)
                vol_k += csr_matrix.sum(S.getcol(node_list.index(nk)))
                # print(nk, S.getcol(cited_list.index(nk)).toarray().tolist())
                for notk in set_not_k:
                    cut_k += G.get_edge_data(nk,notk,default={"weight":0})["weight"]
            # print(cut_k, vol_k)
            sum_cut += (cut_k/vol_k)

        if sum_cut > previous_sum_cut != 0 or i == 99:
            print(i, sum_cut, result_cluster_label)
            return {"result_by_node": previous_cluster_node, "result_by_cluster": previous_cluster_label}
        else:
            previous_cluster_node = result_cluster_node
            previous_cluster_label = result_cluster_label
            previous_sum_cut = sum_cut
Example No. 18
    def gradient(self, w):
        """
        this method calculates the gradient of the weight vector

        :param w:
        :return:
        """

        # empirical counts that are converted from matrix to vector
        empirical_counts = np.squeeze(
            np.asarray(csr_matrix.sum(self.model.train_feature_matrix,
                                      axis=0)))

        # expected counts
        expected_counts = np.zeros(self.model.feature_vector_len)
        for node_idx, node_matrix in self.model.possible_genres_per_node_matrix.items(
        ):
            nominators = np.exp(node_matrix.dot(w))
            denominator = np.sum(nominators)
            prob = nominators / denominator
            expected_counts += node_matrix.transpose().dot(prob)

        return -(empirical_counts - expected_counts - self.lambda_ * w)
def char_counts(df):
    #This function creates columns for each character and counts the times it
    #appears during each study session/day
    
    print 'Calculating time since character last read, etc...'    
    
    #Need to create corpus of characters found in all text_read    
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(decode_error = 'strict', analyzer = 'char')
    corpus = df.loc[:,'text_read']
    dtm = vectorizer.fit_transform(corpus)

    
    import numpy as np  
    from itertools import chain
    import datetime
    from scipy.sparse import csr_matrix
    
    n = df.shape[0]
    df.loc[:, 'percent_seen'] = 0.0
    df.loc[:, 'mean_days_since'] = 0.0
    df.loc[:, 'mean_term_freq'] = 0.0
    for i in range(1, n):   #cycle through all rows except first row
        ##Get percent of characters not seen in text so far        
        prior_non_zero = dtm[:i,:].nonzero()    #Find non-zero values in sparse matrix in (i-1) records
        before_chars = np.unique(prior_non_zero[1])  #Get list of all characters that have been seen so far
        current_chars = np.sort(dtm[i,:].nonzero()[1]) #Find non-zero characters in current record as column #'s    
        #http://stackoverflow.com/questions/28901311/numpy-find-index-of-elements-in-one-array-that-occur-in-another-array
        matching_current_index = np.where(np.in1d(current_chars, before_chars))[0]
        df.loc[i,'percent_seen'] = float(matching_current_index.shape[0])/float(current_chars.shape[0])
        
        ##Get mean days since characters last read (for those already seen in text)        
        #http://stackoverflow.com/questions/10252766/python-numpy-get-array-locations-of-a-list-of-values
        #http://stackoverflow.com/questions/11860476/how-to-unnest-a-nested-list         
        
        #gets list of tuple arrays (1 array per char in matching chars) where each array gives the indices of 
        #prior_non_zero where that character can be found        
        matching_chars = current_chars[matching_current_index]
        prior_array_indices = [np.where(prior_non_zero[1] == k) for k in list(matching_chars)]
        prior_array_indices = list(chain(*prior_array_indices))
        last_date_indices = map(lambda x: max(x), prior_array_indices)        
        last_date_rows = prior_non_zero[0][last_date_indices]
        current_date = df.loc[i,'date']
        days_since_seen = map(lambda x: current_date - x, df.loc[last_date_rows, 'date'])
        df.loc[i,'mean_days_since'] = (sum(days_since_seen, datetime.timedelta(0)).total_seconds()
                                            / 86400.0 / (len(days_since_seen)))
    
        ##Get mean frequency of document terms in the corpus so far
        #    NOT including the text read during the study session
        denominator = float(csr_matrix.sum(dtm[:i,:]))
        numerator = csr_matrix.sum(dtm[:i, matching_current_index])
        df.loc[i, 'mean_term_freq'] = numerator / denominator        
    
    #Normalize the current features
    norm_feat_list = ['cum_time', 'cum_char', 'mean_days_since']
    df = normalize_features(df, norm_feat_list)    
    
    #Create interaction terms with cumulative time and character count features
    df.loc[:,'timeXper_seen'] = df.loc[:, 'norm_cum_time'] *  df.loc[:,'percent_seen'] 
    df.loc[:,'timeXdays_since'] = df.loc[:, 'norm_cum_time'] *  df.loc[:,'norm_mean_days_since']
    df.loc[:,'timeXterm_freq'] = df.loc[:, 'norm_cum_time'] *  df.loc[:,'mean_term_freq']
    
    return df                                
Example No. 20
Pnmatrix1 = PN_cell_towers.as_matrix(columns=None)
i = 0
matrix = np.array([])

while i < Pnmatrix1.shape[0]: # 0-191
    move1 = Pnmatrix1[i, :]
    matrix = np.append(matrix, np.sqrt(((move1[0] - Pnmatrix1[:, 0]) * 110.90444)**2 + ((move1[1] - Pnmatrix1[:, 1]) * 93.45318)**2))
    i = i + 1

newmatrix = np.reshape(matrix, (Pnmatrix1.shape[0], Pnmatrix1.shape[0]))
sA = csr_matrix(newmatrix)
Tcsr = minimum_spanning_tree(sA)
Tcsr = Tcsr.toarray()
Tcsr = csr_matrix(Tcsr)
print(csr_matrix.sum(Tcsr))
cellnetPN = csr_matrix.sum(Tcsr)

Pnmatrix2 = NB_cell_towers.as_matrix(columns=None)
j = 0
matrix2 = np.array([])

while j < Pnmatrix2.shape[0]: # 0-191
    move2 = Pnmatrix2[j, :]
    matrix2 = np.append(matrix2, np.sqrt(((move2[0] - Pnmatrix2[:, 0]) * 110.57483725)**2 + ((move2[1] - Pnmatrix2[:, 1]) * 111.29134198)**2))
    j = j + 1

newmatrix2 = np.reshape(matrix2, (Pnmatrix2.shape[0], Pnmatrix2.shape[0]))

sA2 = csr_matrix(newmatrix2)
Tcsr2 = minimum_spanning_tree(sA2)
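
For reference, a toy check (hypothetical 3-node distance matrix) that summing the minimum spanning tree returned by scipy gives the total network length, as done for cellnetPN above:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.csgraph import minimum_spanning_tree

dist = np.array([[0., 1., 4.],
                 [1., 0., 2.],
                 [4., 2., 0.]])
mst = minimum_spanning_tree(csr_matrix(dist))
print(csr_matrix.sum(mst))  # 3.0, i.e. edges (0-1) and (1-2)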
Example No. 21
save_object_tofile("y_train.csv", y_train.values)
save_object_tofile("y_test.csv", y_test.values)
tmpe = Xt.dot(X.T)
save_sparse_csr("similarity.npz", tmpe)
del tmpe

start = timeit.default_timer()
l = []
for i in range(Xt.shape[0]):
    print(i)
    ve = Xt[i].dot(X.T)
    r = []
    w = []
    e = []
    for m in range(ve.getnnz()):
        if ve.data[m] >= E:
            j = ve.indices[m]
            dist = csr_matrix.sum(csr_matrix.power(Xt[i] - X[j], 2))
            w.append(1 / dist)
            r.append(y_train.values[j])  #y_train.index
            e.append(ve.data[m])
    l.append(heapq.nlargest(K, zip(w, r, e), key=lambda s: s[0]))
    del r, w, e, ve
save_object_tofile("tuple_list.dat", l)
stop = timeit.default_timer()
print("Time:", stop - start)

rst = predict_analyse("tuple_list.dat")
y_predicted = [i[3] for i in rst]
print_test_result(y_test, y_predicted)
#save_result("result.dat",y_predicted)
def compute_support(m):
    N = m.shape[1]
    return csr.sum(m)/(N*((N-1)/2))
Example No. 23
def mcmc(G,
         iter,
         nburn,
         w0=False,
         beta=False,
         n=False,
         u=False,
         sigma=False,
         c=False,
         t=False,
         tau=False,
         x=False,
         hyperparams=False,
         wnu=False,
         all=False,
         sigma_sigma=0.01,
         sigma_c=0.01,
         sigma_t=0.01,
         sigma_tau=0.01,
         sigma_x=0.01,
         a_t=200,
         b_t=1,
         epsilon=0.01,
         R=5,
         w_inference='HMC',
         save_every=1000,
         init='none',
         index=None):

    size = G.number_of_nodes()
    prior = G.graph['prior'] if 'prior' in G.graph else print(
        'You must specify a prior as attribute of G')
    gamma = G.graph['gamma'] if 'gamma' in G.graph else print(
        'You must specify spatial exponent gamma as attribute of G')
    size_x = G.graph['size_x'] if 'size_x' in G.graph else print(
        'You must specify size_x as attribute of G')

    if hyperparams is True or all is True:
        sigma = c = t = tau = True
        if prior == 'singlepl':
            tau = False
    if wnu is True or all is True:
        w0 = beta = n = u = x = True
        if prior == 'singlepl':
            beta = False
    if sigma is True:
        sigma_est = [init['sigma_init']
                     ] if 'sigma_init' in init else [float(np.random.rand(1))]
    else:
        sigma_est = [G.graph['sigma']]
    if c is True:
        c_est = [init['c_init']
                 ] if 'c_init' in init else [float(5 * np.random.rand(1) + 1)]
    else:
        c_est = [G.graph['c']]
    if t is True:
        t_est = [init['t_init']] if 't_init' in init else [
            float(np.random.gamma(a_t, 1 / b_t))
        ]
    else:
        t_est = [G.graph['t']]
    if prior == 'doublepl':
        if tau is True:
            tau_est = [init['tau_init']] if 'tau_init' in init else [
                float(5 * np.random.rand(1) + 1)
            ]
        else:
            tau_est = [G.graph['tau']]
    else:
        tau_est = [0]

    z_est = [(size * sigma_est[0] / t_est[0]) ** (1 / sigma_est[0])] if G.graph['prior'] == 'singlepl' else \
                 [(size * tau_est[0] * sigma_est[0] ** 2 / (t_est[0] * c_est[0] ** (sigma_est[0] * (tau_est[0] - 1)))) ** \
                 (1 / sigma_est[0])]

    if w0 is True:
        if 'w0_init' in init:
            w0_est = [init['w0_init']]
        else:
            g = np.random.gamma(1 - sigma_est[0], 1, size)
            unif = np.random.rand(size)
            w0_est = [
                np.multiply(
                    g,
                    np.power(((z_est[0] + c_est[0])**sigma_est[0]) *
                             (1 - unif) + (c_est[0]**sigma_est[0]) * unif,
                             -1 / sigma_est[0]))
            ]
    else:
        w0_est = [
            np.array([G.nodes[i]['w0'] for i in range(G.number_of_nodes())])
        ]
    if prior == 'doublepl' and beta is True:
        beta_est = [init['beta_init']] if 'beta_init' in init else [
            float(np.random.beta(sigma_est[0] * tau_est[0], 1))
        ]
    if prior == 'singlepl' or beta is False:
        beta_est = [np.array([G.nodes[i]['beta'] for i in range(G.number_of_nodes())])] if 'beta' in G.nodes[0] \
            else [np.ones((size))]
    if u is True:
        u_est = [init['u_init']] if 'u_init' in init else [
            tp.tpoissrnd(z_est[0] * w0_est[0])
        ]
    else:
        u_est = [
            np.array([G.nodes[i]['u'] for i in range(G.number_of_nodes())])
        ]
    if x is True:
        x_est = [init['x_init']
                 ] if 'x_init' in init else [size_x * np.random.rand(size)]
        p_ij_est = [aux.space_distance(x_est[-1], gamma)]
    else:
        if gamma != 0:
            x_est = [
                np.array([G.nodes[i]['x'] for i in range(G.number_of_nodes())])
            ]
            p_ij_est = [aux.space_distance(x_est[-1], gamma)]
        else:
            p_ij_est = [np.ones((size, size))]
    if 'ind' in G.graph:
        ind = G.graph['ind']
    else:
        ind = {k: [] for k in G.nodes}
        for i in G.nodes:
            for j in G.adj[i]:
                if j > i:
                    ind[i].append(j)
    if 'selfedge' in G.graph:
        selfedge = G.graph['selfedge']
    else:
        selfedge = [i in ind[i] for i in G.nodes]
        selfedge = list(compress(G.nodes, selfedge))
    if n is True:
        if 'n_init' in init:
            n_est = [init['n_init']]
        else:
            out_n = up.update_n(w0_est[0], G, size, p_ij_est[-1], ind,
                                selfedge)
            n_est = [out_n[0]]
    else:
        n_est = [G.graph['counts']]

    w_est = [np.exp(np.log(w0_est[0]) - np.log(beta_est[0]))]

    adj = n_est[-1] > 0

    log_post_param_est = [
        aux.log_post_params(prior, sigma_est[-1], c_est[-1], t_est[-1],
                            tau_est[-1], w0_est[-1], beta_est[-1], u_est[-1],
                            a_t, b_t)
    ]
    sum_n = np.array(
        csr_matrix.sum(n_est[-1], axis=0) +
        np.transpose(csr_matrix.sum(n_est[-1], axis=1)))[0]
    log_post_est = [
        aux.log_post_logwbeta_params(prior,
                                     sigma_est[-1],
                                     c_est[-1],
                                     t_est[-1],
                                     tau_est[-1],
                                     w_est[-1],
                                     w0_est[-1],
                                     beta_est[-1],
                                     n_est[-1],
                                     u_est[-1],
                                     p_ij_est[-1],
                                     a_t,
                                     b_t,
                                     gamma,
                                     sum_n,
                                     adj,
                                     log_post_par=log_post_param_est[-1])[0]
    ]
    print('log post initial', log_post_est[-1])

    accept_params = [0]
    accept_hmc = 0
    accept_distance = [0]
    rate = [0]
    rate_p = [0]
    step = 100
    nadapt = 1000

    sigma_prev = sigma_est[-1]
    c_prev = c_est[-1]
    t_prev = t_est[-1]
    tau_prev = tau_est[-1]
    w_prev = w_est[-1]
    w0_prev = w0_est[-1]
    beta_prev = beta_est[-1]
    n_prev = n_est[-1]
    if gamma != 0: x_prev = x_est[-1]
    p_ij_prev = p_ij_est[-1]
    u_prev = u_est[-1]
    z_prev = z_est[-1]

    p = adj.multiply(p_ij_est[-1])
    nlogp = coo_matrix.sum(n_est[-1].multiply(
        p._with_data(np.log(p.data), copy=True)))
    nlogw = sum(sum_n * np.log(w_est[-1]))
    wpw = sum(w_est[-1] * np.dot(p_ij_est[-1], w_est[-1]))
    uw0 = sum((u_est[-1] - 1) * np.log(w0_est[-1]))
    sumw0 = sum(np.log(w0_est[-1]))

    for i in range(iter):

        # update hyperparameters if at least one of them demands the update
        if sigma is True or c is True or t is True or tau is True:
            output_params = up.update_params(prior,
                                             sigma_prev,
                                             c_prev,
                                             t_prev,
                                             tau_prev,
                                             z_prev,
                                             w0_prev,
                                             beta_prev,
                                             u_prev,
                                             log_post_param_est[-1],
                                             accept_params[-1],
                                             sigma=sigma,
                                             c=c,
                                             t=t,
                                             tau=tau,
                                             sigma_sigma=sigma_sigma,
                                             sigma_c=sigma_c,
                                             sigma_t=sigma_t,
                                             sigma_tau=sigma_tau,
                                             a_t=a_t,
                                             b_t=b_t)
            sigma_prev = output_params[0]
            c_prev = output_params[1]
            t_prev = output_params[2]
            tau_prev = output_params[3]
            z_prev = output_params[4]
            accept_params.append(output_params[5])
            log_post_param_est.append(output_params[6])
            rate_p.append(output_params[7])
            if (i + 1) % save_every == 0 and i != 0:
                sigma_est.append(sigma_prev)
                c_est.append(c_prev)
                t_est.append(t_prev)
                tau_est.append(tau_prev)
                z_est.append(z_prev)
            if i % 1000 == 0:
                print('update hyperparams iteration ', i)
                print('acceptance rate hyperparams = ',
                      round(accept_params[-1] / (i + 1) * 100, 1), '%')
            if (i % step) == 0 and i != 0 and i < nburn:
                if sigma is True:
                    sigma_sigma = aux.tune(accept_params, sigma_sigma, step)
                if c is True:
                    sigma_c = aux.tune(accept_params, sigma_c, step)
                if t is True:
                    sigma_t = aux.tune(accept_params, sigma_t, step)
                if tau is True:
                    sigma_tau = aux.tune(accept_params, sigma_tau, step)

        # update w and beta if at least one of them is True
        if w0 is True:
            if accept_params[-1] == 0:
                log_post_est.append(log_post_est[-1])
            if accept_params[-1] == 1:
                temp = aux.log_post_logwbeta_params(
                    prior,
                    sigma_prev,
                    c_prev,
                    t_prev,
                    tau_prev,
                    w_prev,
                    w0_prev,
                    beta_prev,
                    n_prev,
                    u_prev,
                    p_ij_prev,
                    a_t,
                    b_t,
                    gamma,
                    sum_n,
                    adj,
                    log_post_par=log_post_param_est[-1],
                    nlogp=nlogp,
                    nlogw=nlogw,
                    wpw=wpw,
                    uw0=uw0,
                    sumw0=sumw0)
                log_post_est.append(temp[0])
            if w_inference == 'gibbs':
                output_gibbs = up.gibbs_w(w_prev, beta_prev, sigma_prev,
                                          c_prev, z_prev, u_prev, n_prev,
                                          p_ij_prev, gamma, sum_n)
                w_prev = output_gibbs[0]
                w0_prev = output_gibbs[1]
                log_post_param_est.append(
                    aux.log_post_params(prior, sigma_prev, c_prev, t_prev,
                                        tau_prev, w0_prev, beta_prev, u_prev,
                                        a_t, b_t))
                temp = aux.log_post_logwbeta_params(
                    prior,
                    sigma_prev,
                    c_prev,
                    t_prev,
                    tau_prev,
                    w_prev,
                    w0_prev,
                    beta_prev,
                    n_prev,
                    u_prev,
                    p_ij_prev,
                    a_t,
                    b_t,
                    gamma,
                    sum_n,
                    adj,
                    log_post_par=log_post_param_est[-1],
                    nlogp=nlogp)
                log_post_est.append(temp[0])
                ##
                nlogw = temp[2]
                wpw = temp[3]
                uw0 = temp[4]
                sumw0 = temp[5]
                ##
                if i % 1000 == 0 and i != 0:
                    print('update w (gibbs) iteration ', i)
            if w_inference == 'HMC':
                output_hmc = up.HMC_w(prior,
                                      w_prev,
                                      w0_prev,
                                      beta_prev,
                                      n_prev,
                                      u_prev,
                                      sigma_prev,
                                      c_prev,
                                      t_prev,
                                      tau_prev,
                                      z_prev,
                                      gamma,
                                      p_ij_prev,
                                      a_t,
                                      b_t,
                                      epsilon,
                                      R,
                                      accept_hmc,
                                      size,
                                      sum_n,
                                      adj,
                                      log_post_est[-1],
                                      log_post_param_est[-1],
                                      nlogp,
                                      nlogw,
                                      wpw,
                                      uw0,
                                      sumw0,
                                      update_beta=beta)
                w_prev = output_hmc[0]
                w0_prev = output_hmc[1]
                beta_prev = output_hmc[2]
                accept_hmc = output_hmc[3]
                rate.append(output_hmc[4])
                log_post_est.append(output_hmc[5])
                log_post_param_est.append(output_hmc[6])
                ##
                nlogw = output_hmc[7]
                wpw = output_hmc[8]
                uw0 = output_hmc[9]
                sumw0 = output_hmc[10]
                ##
                if i % 100 == 0 and i != 0:
                    # if i < nadapt:
                    if i >= step:
                        # epsilon = np.exp(np.log(epsilon) + 0.01 * (np.mean(rate) - 0.6))
                        epsilon = np.exp(
                            np.log(epsilon) + 0.01 *
                            (np.mean(rate[i - step:i]) - 0.6))
                if i % 1000 == 0:
                    print('update w and beta iteration ', i)
                    print('acceptance rate HMC = ',
                          round(accept_hmc / (i + 1) * 100, 1), '%')
                    print('epsilon = ', epsilon)
            if (i + 1) % save_every == 0 and i != 0:
                w_est.append(w_prev)
                w0_est.append(w0_prev)
                beta_est.append(beta_prev)

        # update n
        step_n = 1
        if n is True and (i + 1) % step_n == 0:
            n_prev = up.update_n(w_prev, G, size, p_ij_prev, ind, selfedge)
            sum_n = np.array(
                csr_matrix.sum(n_prev, axis=0) +
                np.transpose(csr_matrix.sum(n_prev, axis=1)))[0]
            log_post_param_est.append(log_post_param_est[-1])
            temp = aux.log_post_logwbeta_params(
                prior,
                sigma_prev,
                c_prev,
                t_prev,
                tau_prev,
                w_prev,
                w0_prev,
                beta_prev,
                n_prev,
                u_prev,
                p_ij_prev,
                a_t,
                b_t,
                gamma,
                sum_n,
                adj,
                log_post_par=log_post_param_est[-1],
                wpw=wpw,
                uw0=uw0,
                sumw0=sumw0)
            log_post_est.append(temp[0])
            ##
            nlogp = temp[1]
            nlogw = temp[2]
            ##

            if (i + 1) % save_every == 0 and i != 0:
                n_est.append(n_prev)
            if i % 1000 == 0:
                print('update n iteration ', i)

        # update u
        if u is True:
            u_prev = up.posterior_u(z_prev * w0_prev)
            log_post_param_est.append(
                aux.log_post_params(prior, sigma_prev, c_prev, t_prev,
                                    tau_prev, w0_prev, beta_prev, u_prev, a_t,
                                    b_t))
            temp = aux.log_post_logwbeta_params(
                prior,
                sigma_prev,
                c_prev,
                t_prev,
                tau_prev,
                w_prev,
                w0_prev,
                beta_prev,
                n_prev,
                u_prev,
                p_ij_prev,
                a_t,
                b_t,
                gamma,
                sum_n,
                adj,
                log_post_par=log_post_param_est[-1],
                nlogp=nlogp,
                nlogw=nlogw,
                wpw=wpw,
                sumw0=sumw0)
            log_post_est.append(temp[0])
            ##
            uw0 = temp[4]
            ##
            if (i + 1) % save_every == 0 and i != 0:
                u_est.append(u_prev)
            if i % 1000 == 0:
                print('update u iteration ', i)

        step_x = 1
        if x is True and (i + 1) % step_x == 0:
            out_x = up.update_x(x_prev, w_prev, gamma, p_ij_prev, n_prev,
                                sigma_x, accept_distance[-1], prior,
                                sigma_prev, c_prev, t_prev, tau_prev, w0_prev,
                                beta_prev, u_prev, a_t, b_t, sum_n, adj,
                                log_post_est[-1], log_post_param_est[-1],
                                index, nlogw, uw0, sumw0, nlogp, wpw)
            x_prev = out_x[0]
            p_ij_prev = out_x[1]
            accept_distance.append(out_x[2])
            log_post_est.append(out_x[3])
            ##
            nlogp = out_x[4]
            wpw = out_x[5]
            ##
            if (i + 1) % save_every == 0 and i != 0:
                p_ij_est.append(p_ij_prev)
                x_est.append(x_prev)
            if i % 1000 == 0:
                print('update x iteration ', i)
                print('acceptance rate x = ',
                      round(accept_distance[-1] * 100 * step_x / iter, 1), '%')
                print('sigma_x = ', sigma_x)
            if (i % (step / step_x)) == 0 and i != 0 and i < nburn:
                sigma_x = aux.tune(accept_distance, sigma_x,
                                   int(step / step_x))

    if gamma != 0:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est, u_est, \
                log_post_param_est, log_post_est, p_ij_est, x_est
    else:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est, u_est, \
                log_post_param_est, log_post_est, p_ij_est
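
A toy illustration (hypothetical counts matrix) of the sum_n construction used above: column sums plus transposed row sums give, for every node, the edge counts in either direction.

import numpy as np
from scipy.sparse import csr_matrix

n = csr_matrix(np.array([[0, 2, 0],
                         [0, 0, 1],
                         [0, 0, 0]]))
sum_n = np.array(csr_matrix.sum(n, axis=0) +
                 np.transpose(csr_matrix.sum(n, axis=1)))[0]
print(sum_n)  # [2 3 1]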
Example No. 24
def main(nelx, nely, nelz, volfrac, penal, rmin, heaviside):
    # USER DEFINED PRINT ORIENTATION
    baseplate = 'S'
    # USER DEFINED LOOP PARAMETERS
    maxloop = 1000
    tolx = 0.01
    displayflag = 0
    # USER DEFINED MATERIAL PROPERTIES
    E0 = 1
    Emin = 1e-9
    nu = 0.3
    # USER DEFINED LOAD DoFs
    il, jl, kl = np.meshgrid(nelx, 0, np.arange(nelz + 1))
    loadnid = kl * (nelx + 1) * (nely + 1) + il * (nely + 1) + (nely + 1 - jl)
    loaddof = 3 * np.ravel(loadnid, order='F') - 1  #CURRENTLY A 1D ARRAY (used for sparse later)
    # USER DEFINED SUPPORT FIXED DOFS
    iif, jf, kf = np.meshgrid(0, np.arange(nely + 1), np.arange(nelz + 1))
    fixednid = kf * (nelx + 1) * (nely + 1) + iif * (nely + 1) + (nely + 1 - jf)
    fixeddof = np.concatenate((3 * np.ravel(fixednid, order='F'), 3*np.ravel(fixednid, order='F')-1,
                        3*np.ravel(fixednid, order='F') - 2)) #CURRENTLY A 1D ARRAY (used for sparse later)

    # PREPARE FE ANALYSIS
    nele = nelx * nely * nelz
    ndof = 3 * (nelx + 1) * (nely + 1) * (nelz + 1)
    F = csr_matrix((-1 * np.ones(np.shape(loaddof)), (loaddof-1, np.ones(np.shape(loaddof))-1)),
                    shape=(ndof, 1))
    U = np.zeros((ndof, 1))
    freedofs = np.setdiff1d(np.arange(ndof) + 1, fixeddof)
    KE = lk_H8(nu)
    nodegrd = np.reshape(np.arange((nely + 1) * (nelx + 1)) + 1, (nely + 1, nelx + 1), order = 'F')
    nodeids = np.reshape(nodegrd[0:-1, 0:-1], (nely * nelx, 1), order='F')
    nodeidz = np.arange(0, (nelz - 1) * (nely + 1) * (nelx + 1) + 1, (nely + 1) * (nelx + 1))[np.newaxis]
    nodeids = (np.matlib.repmat(nodeids, np.shape(nodeidz)[0], np.shape(nodeidz)[1])
                    + np.matlib.repmat(nodeidz, np.shape(nodeids)[0], np.shape(nodeids)[1]))
    edofVec = (3 * np.ravel(nodeids, order='F') + 1)[np.newaxis]
    edofMat = (np.matlib.repmat(edofVec.T, 1, 24) +
                np.matlib.repmat(np.concatenate((
                    np.array([0, 1, 2]), 3*nely + np.array([3, 4, 5, 0, 1, 2]), np.array([-3, -2, -1]),
                    3*(nely + 1)*(nelx + 1) + np.concatenate((
                        np.array([0, 1, 2]), 3*nely+np.array([3, 4, 5, 0, 1, 2]), np.array([-3, -2, -1])
                        ))
                    )), nele, 1))
    iK = np.reshape(np.kron(edofMat, np.ones((24, 1))).T, (24 * 24 * nele, 1), order='F')
    jK = np.reshape(np.kron(edofMat, np.ones((1, 24))).T, (24 * 24 * nele, 1), order='F')

    # PREPARE FILTER
    iH = np.ones((int(nele * (2 * (np.ceil(rmin) - 1) + 1)** 2), 1))
    iHdummy = []
    jH = np.ones(np.shape(iH))
    jHdummy = []
    sH = np.zeros(np.shape(iH))
    sHdummy = []
    k = 0
    #####################
    for k1 in np.arange(nelz)+1:
        for i1 in np.arange(nelx)+1:
            for j1 in np.arange(nely)+1:
                e1 = (k1 - 1) * nelx * nely + (i1 - 1) * nely + j1
                for k2 in np.arange(max(k1 - (np.ceil(rmin) - 1), 1), min(k1 + (np.ceil(rmin) - 1), nelz) + 1):
                    for i2 in np.arange(max(i1 - (np.ceil(rmin) - 1), 1), min(i1 + (np.ceil(rmin) - 1), nelx) + 1):
                        for j2 in np.arange(max(j1 - (np.ceil(rmin) - 1), 1), min(j1 + (np.ceil(rmin) - 1), nely) + 1):
                            e2 = (k2 - 1) * nelx * nely + (i2 - 1) * nely + j2
                            if k < np.size(iH):
                                iH[k] = e1
                                jH[k] = e2
                                sH[k] = max(0, rmin - np.sqrt((i1 - i2)** 2 + (j1 - j2)** 2 + (k1 - k2)** 2))
                            else:
                                iHdummy.append(e1)
                                jHdummy.append(e2)
                                sHdummy.append(max(0, rmin - np.sqrt((i1 - i2)** 2 + (j1 - j2)** 2 + (k1 - k2)** 2)))
                            k = k + 1
    #####################
    iH = np.concatenate((iH, np.array(iHdummy).reshape((len(iHdummy), 1))))
    jH = np.concatenate((jH, np.array(jHdummy).reshape((len(jHdummy), 1))))
    sH = np.concatenate((sH, np.array(sHdummy).reshape((len(sHdummy), 1))))

    H = csr_matrix((np.squeeze(sH), (np.squeeze(iH.astype(int)) - 1, np.squeeze(jH.astype(int)) - 1)))
    Hs = csr_matrix.sum(H, axis=0).T

    if heaviside == 0:
        # INITIALIZE ITERATION
        x = np.tile(volfrac, [nelz, nely, nelx])
        xPhys = x
        ######## AMFILTER CALL TYPE 1 #########
        xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
        ##################################
        loop = 0
        change = 1
        # START ITERATION
        while change > tolx and loop < maxloop:
            loop = loop + 1
            # FE ANALYSIS
            sK = np.reshape(np.ravel(KE, order='F')[np.newaxis].T @ (Emin+xPrint.transpose(0,2,1).ravel(order='C')[np.newaxis]**penal*(E0-Emin)),(24*24*nele,1),order='F')
            K = csr_matrix((np.squeeze(sK), (np.squeeze(iK.astype(int)) - 1, np.squeeze(jK.astype(int)) - 1)))
            K = (K + K.T) / 2
            U[freedofs - 1,:] = spsolve(K[freedofs - 1,:][:, freedofs - 1], F[freedofs - 1,:])[np.newaxis].T 

            # OBJECTIVE FUNCTION AND SENSITIVITY ANALYSIS
            ce = np.reshape(np.sum((U[edofMat - 1].squeeze() @ KE) * U[edofMat - 1].squeeze(), axis=1), (nelz, nelx, nely), order = 'C').transpose(0,2,1)
            c = np.sum(np.sum(np.sum(Emin + xPrint ** penal * (E0 - Emin) * ce)))  # REPLACE xPhys with xPrint
            dc = -penal * (E0 - Emin) * (xPrint ** (penal - 1)) * ce               # REPLACE xPhys with xPrint
            dv = np.ones((nelz, nely, nelx))
            ######### AMFILTER CALL TYPE 2 #########
            xPrint, senS = AMFilter3D.AMFilter(xPhys, baseplate, dc, dv)
            dc = senS[0]
            dv = senS[1]
            ###################################
            # FILTERING AND MODIFICATION OF SENSITIVITIES
            dc = np.array((H @ (dc.transpose(0,2,1).ravel(order='C')[np.newaxis].T/Hs))).reshape((nelz, nelx, nely), order = 'C').transpose(0,2,1)
            dv = np.array((H @ (dv.transpose(0,2,1).ravel(order='C')[np.newaxis].T/Hs))).reshape((nelz, nelx, nely), order = 'C').transpose(0,2,1)

            # OPTIMALITY CRITERIA UPDATE
            l1 = 0
            l2 = 1e9
            move = 0.05
            while (l2 - l1) / (l1 + l2) > 1e-3 and l2>1e-9:
                lmid = 0.5 * (l2 + l1)
                xnew_step1 = np.minimum(x + move, x * np.sqrt(-dc / dv / lmid))
                xnew_step2 = np.minimum(1, xnew_step1)
                xnew_step3 = np.maximum(x - move, xnew_step2)
                xnew = np.maximum(0, xnew_step3)
                xPhys = np.array((H @ (xnew.transpose(0,2,1).ravel(order='C')[np.newaxis].T)/Hs)).reshape((nelz, nelx, nely), order = 'C').transpose(0,2,1)
                ######### AMFILTER CALL TYPE 1 ######
                xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
                #################################
                if np.sum(xPrint.ravel(order='C')) > volfrac * nele:  # REPLACE xPhys with xPrint
                    l1 = lmid
                else:
                    l2 = lmid
            change = np.max(np.absolute(np.ravel(xnew, order='F') - np.ravel(x, order='F')))
            x = xnew
            print("it.: {0} , ch.: {1:.3f}, obj.: {2:.4f}, Vol.: {3:.3f}".format(
                loop, change, c, np.mean(xPrint.ravel(order='C'))))
    elif heaviside == 1:
        beta = 1
        # INITIALIZE ITERATION
        x = np.tile(volfrac, [nelz, nely, nelx])
        xTilde = x
        xPhys = 1 - np.exp(-beta * xTilde) + xTilde * np.exp(-beta)
        ######## AMFILTER CALL TYPE 1 #########
        xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
        ##################################
        loop = 0
        loopbeta = 0
        change = 1

        # START ITERATION
        while change > tolx and loop < maxloop:
            loop = loop + 1
            loopbeta = loopbeta + 1
            
            # FE ANALYSIS
            sK = np.reshape(np.ravel(KE, order='F')[np.newaxis].T @ (Emin+xPrint.transpose(0,2,1).ravel(order='C')[np.newaxis]**penal*(E0-Emin)),(24*24*nele,1),order='F')
            K = csr_matrix((np.squeeze(sK), (np.squeeze(iK.astype(int)) - 1, np.squeeze(jK.astype(int)) - 1)))
            K = (K + K.T) / 2
            U[freedofs - 1,:] = spsolve(K[freedofs - 1,:][:, freedofs - 1], F[freedofs - 1,:])[np.newaxis].T 

            # OBJECTIVE FUNCTION AND SENSITIVITY ANALYSIS
            ce = np.reshape(np.sum((U[edofMat - 1].squeeze() @ KE) * U[edofMat - 1].squeeze(), axis=1), (nelz, nelx, nely), order = 'C').transpose(0,2,1)
            c = np.sum(np.sum(np.sum(Emin + xPrint ** penal * (E0 - Emin) * ce)))  # REPLACE xPhys with xPrint
            dc = -penal * (E0 - Emin) * (xPrint ** (penal - 1)) * ce               # REPLACE xPhys with xPrint
            dv = np.ones((nelz, nely, nelx))
            ######### AMFILTER CALL TYPE 2 #########
            xPrint, senS = AMFilter3D.AMFilter(xPhys, baseplate, dc, dv)
            dc = senS[0]
            dv = senS[1]
            ###################################
            # FILTERING AND MODIFICATION OF SENSITIVITIES
            dx = beta * np.exp(-beta * xTilde) + np.exp(-beta)
            dc = np.array((H @ (dc.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T *
                                dx.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T
                                /Hs))).reshape((nelz, nelx, nely), order = 'C').transpose(0,2,1)
            dv = np.array((H @ (dv.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T *
                                dx.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T
                                /Hs))).reshape((nelz, nelx, nely), order = 'C').transpose(0,2,1)

            # OPTIMALITY CRITERIA UPDATE
            l1 = 0
            l2 = 1e9
            move = 0.05
            while (l2 - l1) / (l1 + l2) > 1e-3:
                lmid = 0.5 * (l2 + l1)
                xnew_step1 = np.minimum(x + move, x * np.sqrt(-dc / dv / lmid))
                xnew_step2 = np.minimum(1, xnew_step1)
                xnew_step3 = np.maximum(x - move, xnew_step2)
                xnew = np.maximum(0, xnew_step3)
                xTilde = np.array((H @ (xnew.transpose(0,2,1).ravel(order='C')[np.newaxis].T)/Hs)).reshape((nelz, nelx, nely), order = 'C').transpose(0,2,1)
                xPhys = 1 - np.exp(-beta * xTilde) + xTilde * np.exp(-beta)
                ######### AMFILTER CALL TYPE 1 ######
                xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
                #################################
                if np.sum(xPrint.ravel(order='C')) > volfrac * nele:  # REPLACE xPhys with xPrint
                    l1 = lmid
                else:
                    l2 = lmid
            change = np.max(np.absolute(np.ravel(xnew, order='F') - np.ravel(x, order='F')))
            x = xnew
            if beta < 512 and (loopbeta >= 50 or change <= 0.01):
                beta = 2 * beta
                loopbeta = 0
                change = 1
                print("Parameter beta increased to {0}. \n".format(beta))
                
            print("it.: {0} , ch.: {1:.3f}, obj.: {2:.4f}, Vol.: {3:.3f}".format(
                loop, change, c, np.mean(xPrint.ravel(order='C'))))

    return xPrint
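
The heaviside == 1 branch above chains the density filter H/Hs with a smoothed Heaviside projection and doubles beta (continuation) whenever the design stalls. Below is a minimal, self-contained sketch of just the projection step and its chain-rule factor, assuming only that the formulas match the ones used in the loop above (the function name is illustrative, not part of the original module):

import numpy as np

def heaviside_projection(xTilde, beta):
    # projected density, used as xPhys in the loop above
    xPhys = 1 - np.exp(-beta * xTilde) + xTilde * np.exp(-beta)
    # derivative d(xPhys)/d(xTilde), the factor multiplied into dc and dv
    dx = beta * np.exp(-beta * xTilde) + np.exp(-beta)
    return xPhys, dx

xTilde = np.full((4, 4, 4), 0.3)          # toy filtered density field
xPhys, dx = heaviside_projection(xTilde, beta=8)
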
    df.index = titles
    a = df.sum(axis=1)
    mask2 = a>0
    return df.loc[mask2,:]

def create_sparse_matrix(list_of_followers):
    list_of_strings = []
    list_of_titles = []
    for ls in list_of_followers:
        x = ' '.join([ str(x) for x in ls['TwitterFollowers']])
        list_of_strings.append(x)
        list_of_titles.append(ls['Title'])
    return list_of_strings, list_of_titles


strings,titles = create_sparse_matrix(practice)
final = CV.fit_transform(strings)

a = csr_matrix.sum(final, axis=0)
a = np.array(a).reshape(a.shape[1],)
sparse = final.toarray()
mask = a>1
u=np.where(mask)[0]
sparse_mat = final.tocsc()[:,u]
df = sparse_mat.toarray()
attempt = get_truncated_matrix(mat, 25000)
df = pd.DataFrame(attempt)
item_matrix=gl.SFrame(df)

item_matrix.save("IHavebeensaved.gl")
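
The block above prunes rare vocabulary columns from a CountVectorizer output: sum each column with csr_matrix.sum, keep only the terms that occur more than once, and slice those columns out of the sparse matrix. A toy reproduction of that step on made-up data:

import numpy as np
from scipy.sparse import csr_matrix

final = csr_matrix(np.array([[1, 0, 2],
                             [0, 0, 3],
                             [1, 0, 0]]))
col_totals = np.asarray(csr_matrix.sum(final, axis=0)).ravel()  # per-term counts
keep = np.where(col_totals > 1)[0]                              # terms occurring more than once
reduced = final.tocsc()[:, keep]                                # columns 0 and 2 survive
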
Exemplo n.º 26
0
def GraphSampler(prior,
                 approximation,
                 typesampler,
                 sigma,
                 c,
                 t,
                 tau,
                 gamma,
                 size_x,
                 type_prior_x,
                 dim_x,
                 a_t=200,
                 b_t=1,
                 print_=True,
                 **kwargs):

    start = time.time()
    # sample weights w, w0, beta
    output = weight.WeightsSampler(prior, approximation, t, sigma, c, tau,
                                   **kwargs)
    w = kwargs['w'] if 'w' in kwargs else output[0]
    w0 = kwargs['w0'] if 'w0' in kwargs else output[1]
    beta = kwargs['beta'] if 'beta' in kwargs else output[2]
    size = len(w)
    # sample locations
    x = kwargs['x'] if 'x' in kwargs else loc.LocationsSampler(
        size_x, size, type_prior_x, dim_x)
    # sample graph
    if typesampler == "naive":
        [G, w, x, size] = NaiveSampler(w, x, gamma, dim_x)
    if typesampler == "layers":
        K = kwargs['K'] if 'K' in kwargs else 100
        [G, w, x, size] = SamplerLayers_optim(w, x, gamma, size_x, K)
    end = time.time()

    deg = np.array(list(dict(G.degree()).values()))
    if print_ is True:
        print('time to produce sample: ', round((end - start) / 60, 2), ' min')
        print('number of active nodes: ', sum(deg > 0))
        print('total number of nodes L: ', len(deg))

    G.graph['prior'] = prior
    G.graph['sigma'] = sigma
    G.graph['c'] = c
    G.graph['t'] = t
    G.graph['tau'] = tau
    G.graph['gamma'] = gamma
    G.graph['size_x'] = size_x
    G.graph['a_t'] = a_t
    G.graph['b_t'] = b_t

    # set nodes attributes: w, w0, beta, x, u
    z = (size * sigma / t) ** (1 / sigma) if prior == 'singlepl' else \
        (size * tau * sigma ** 2 / (t * c ** (sigma * (tau - 1)))) ** (1 / sigma)
    G.graph['z'] = z
    u = tp.tpoissrnd(z * w0)
    d = {k: [] for k in G.nodes}
    for i in G.nodes():
        d[i] = {'w': w[i], 'w0': w0[i], 'beta': beta[i], 'x': x[i], 'u': u[i]}
    nx.set_node_attributes(G, d)

    # set graph attributes: ind (upper triangular matrix of neighbors of nodes) and selfedge (list of nodes w/ selfedge)
    ind = {k: [] for k in G.nodes}
    for i in G.nodes:
        for j in G.adj[i]:
            if j >= i:
                ind[i].append(j)
    selfedge = [i in ind[i] for i in G.nodes]
    selfedge = list(compress(G.nodes, selfedge))
    G.graph['ind'] = ind
    G.graph['selfedge'] = selfedge

    # computing "distance" matrix p_ij = 1 / ((1 + |x_i-x_j|) ** gamma)
    p_ij = aux.space_distance(x, gamma) if gamma != 0 else np.ones(
        (size, size))
    G.graph['distances'] = p_ij

    # computing counts upper triangular matrix n
    n_out = up.update_n(w, G, size, p_ij, ind, selfedge)
    n = n_out[0]
    G.graph['counts'] = n
    # for the counts, it would be nice to set up a nx.MultiGraph, but some algorithms don't work on these
    # graphs, so for the moment I'll assign n as an attribute of the whole graph rather than the single nodes
    sum_n = np.array(
        csr_matrix.sum(n, axis=0) + np.transpose(csr_matrix.sum(n, axis=1)))[0]
    G.graph['sum_n'] = sum_n
    sum_fact_n = n_out[1]
    G.graph['sum_fact_n'] = sum_fact_n

    #  attach log posterior of the graph as attribute
    adj = n > 0

    # ### SPEED UP - when updating x alone
    # ind = np.argsort(deg)
    # index = ind[0:len(ind) - 1]
    # log_post = aux.log_post_logwbeta_params(prior, sigma, c, t, tau, w, w0, beta, n, u, p_ij, a_t, b_t, gamma, sum_n,
    #                                         adj, x, index=index)
    # ### SPEED UP - when updating x alone
    log_post_param = aux.log_post_params(prior, sigma, c, t, tau, w0, beta, u,
                                         a_t, b_t)
    log_post = aux.log_post_logwbeta_params(prior, sigma, c, t, tau, w, w0,
                                            beta, n, u, p_ij, a_t, b_t, gamma,
                                            sum_n, adj, x)
    G.graph['log_post'] = log_post
    G.graph['log_post_param'] = log_post_param

    return G
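
Since n is an upper-triangular sparse matrix of edge counts, the per-node totals sum_n are formed by adding its column sums to its (transposed) row sums, both computed with csr_matrix.sum. A toy reproduction of that step on a made-up 3x3 counts matrix:

import numpy as np
from scipy.sparse import csr_matrix

n = csr_matrix(np.array([[0, 2, 1],
                         [0, 0, 3],
                         [0, 0, 0]]))      # upper-triangular edge counts
sum_n = np.array(csr_matrix.sum(n, axis=0) +
                 np.transpose(csr_matrix.sum(n, axis=1)))[0]
# sum_n == [3, 5, 4]: total multiplicity of edges incident to each node
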
Exemplo n.º 27
0
def mcmc(G,
         iter,
         nburn,
         w0=False,
         beta=False,
         n=False,
         u=False,
         sigma=False,
         c=False,
         t=False,
         tau=False,
         x=False,
         hyperparams=False,
         wnu=False,
         all=False,
         sigma_sigma=0.01,
         sigma_c=0.01,
         sigma_t=0.01,
         sigma_tau=0.01,
         sigma_x=0.01,
         a_t=200,
         b_t=1,
         epsilon=0.01,
         R=5,
         w_inference='HMC',
         save_every=1000,
         init='none',
         index=None,
         type_prop_x='tNormal'):

    size = G.number_of_nodes()
    prior = G.graph['prior'] if 'prior' in G.graph else print(
        'You must specify a prior as attribute of G')
    gamma = G.graph['gamma'] if 'gamma' in G.graph else print(
        'You must specify spatial exponent gamma as G attribute')
    size_x = G.graph['size_x'] if 'size_x' in G.graph else print(
        'You must specify size_x as attribute of G')

    sigma_est, c_est, t_est, tau_est, w_est, w0_est, beta_est, n_est, x_est, p_ij_est, u_est, z_est, ind, selfedge = \
     init_var(G, size, gamma, init, w0, beta, n, u, sigma, c, t, tau, x, hyperparams, wnu, all, prior, a_t, b_t, size_x)

    accept_params = [0]
    accept_hmc = 0
    accept_distance = [0]
    rate = [0]
    rate_p = [0]
    step = 100
    nadapt = 1000

    sigma_prev = sigma_est[-1]
    c_prev = c_est[-1]
    t_prev = t_est[-1]
    tau_prev = tau_est[-1]
    w_prev = w_est[-1]
    w0_prev = w0_est[-1]
    beta_prev = beta_est[-1]
    n_prev = n_est[-1]
    x_prev = x_est[-1]
    p_ij_prev = p_ij_est[-1]
    u_prev = u_est[-1]
    z_prev = z_est[-1]
    sum_n = np.array(
        csr_matrix.sum(n_prev, axis=0) +
        np.transpose(csr_matrix.sum(n_prev, axis=1)))[0]
    adj = n_prev > 0
    log_post_param_prev = aux.log_post_params(prior, sigma_prev, c_prev,
                                              t_prev, tau_prev, w0_prev,
                                              beta_prev, u_prev, a_t, b_t)
    log_post_prev = aux.log_post_logwbeta_params(
        prior,
        sigma_prev,
        c_prev,
        t_prev,
        tau_prev,
        w_prev,
        w0_prev,
        beta_prev,
        n_prev,
        u_prev,
        p_ij_prev,
        a_t,
        b_t,
        gamma,
        sum_n,
        adj,
        x_prev,
        log_post_par=log_post_param_prev)

    log_post_param_est = [log_post_param_prev]
    log_post_est = [log_post_prev]

    for i in range(iter):

        # update hyperparameters if at least one of them demands the update
        if sigma is True or c is True or t is True or tau is True:
            sigma_prev, c_prev, t_prev, tau_prev, z_prev, accept_param_prev, log_post_param_prev, rate_p_prev \
                = up.update_params(prior, sigma_prev, c_prev, t_prev, tau_prev, z_prev,
                                   w0_prev, beta_prev, u_prev, log_post_param_prev, accept_params[-1],
                                   sigma=sigma, c=c, t=t, tau=tau,
                                   sigma_sigma=sigma_sigma, sigma_c=sigma_c, sigma_t=sigma_t,
                                   sigma_tau=sigma_tau, a_t=a_t, b_t=b_t)
            accept_params.append(accept_param_prev)
            rate_p.append(rate_p_prev)
            # if you only have to update hyperparams, then log_post = log_post_param, otherwise you need to update that
            if w0 is True or n is True or u is True or x is True:
                log_post_prev = aux.log_post_logwbeta_params(
                    prior,
                    sigma_prev,
                    c_prev,
                    t_prev,
                    tau_prev,
                    w_prev,
                    w0_prev,
                    beta_prev,
                    n_prev,
                    u_prev,
                    p_ij_prev,
                    a_t,
                    b_t,
                    gamma,
                    sum_n,
                    adj,
                    x_prev,
                    log_post_par=log_post_param_prev)
            if (i + 1) % save_every == 0 and i != 0:
                sigma_est.append(sigma_prev)
                c_est.append(c_prev)
                t_est.append(t_prev)
                tau_est.append(tau_prev)
                z_est.append(z_prev)
                log_post_param_est.append(log_post_param_prev)
                if w0 is True or n is True or u is True or x is True:
                    log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update hyperparams iteration ', i)
                print('acceptance rate hyperparams = ',
                      round(accept_params[-1] / (i + 1) * 100, 1), '%')
            if (i % step) == 0 and i != 0 and i < nburn:
                if sigma is True:
                    sigma_sigma = aux.tune(accept_params, sigma_sigma, step)
                if c is True:
                    sigma_c = aux.tune(accept_params, sigma_c, step)
                if t is True:
                    sigma_t = aux.tune(accept_params, sigma_t, step)
                if tau is True:
                    sigma_tau = aux.tune(accept_params, sigma_tau, step)

        # update w and beta if at least one of them is True
        if w0 is True:

            if w_inference == 'gibbs':
                w_prev, w0_prev = up.gibbs_w(w_prev, beta_prev, sigma_prev,
                                             c_prev, z_prev, u_prev, n_prev,
                                             p_ij_prev, gamma, sum_n)
                log_post_param_prev = aux.log_post_params(
                    prior, sigma_prev, c_prev, t_prev, tau_prev, w0_prev,
                    beta_prev, u_prev, a_t, b_t)
                log_post_prev = aux.log_post_logwbeta_params(
                    prior,
                    sigma_prev,
                    c_prev,
                    t_prev,
                    tau_prev,
                    w_prev,
                    w0_prev,
                    beta_prev,
                    n_prev,
                    u_prev,
                    p_ij_prev,
                    a_t,
                    b_t,
                    gamma,
                    sum_n,
                    adj,
                    x_prev,
                    log_post_par=log_post_param_prev)
                if (i + 1) % save_every == 0 and i != 0:
                    w_est.append(w_prev)
                    w0_est.append(w0_prev)
                    beta_est.append(beta_prev)
                    log_post_est.append(log_post_prev)
                    log_post_param_est.append(log_post_param_prev)
                if i % 1000 == 0 and i != 0:
                    print('update w iteration ', i)
            if w_inference == 'HMC':
                w_prev, w0_prev, beta_prev, accept_hmc, rate_prev, log_post_prev, log_post_param_prev \
                            = up.HMC_w(prior, w_prev, w0_prev, beta_prev, n_prev, u_prev,
                                       sigma_prev, c_prev, t_prev, tau_prev, z_prev, gamma,
                                       p_ij_prev, a_t, b_t, epsilon, R, accept_hmc, size, sum_n, adj, x_prev,
                                       log_post_prev, log_post_param_prev, update_beta=beta)
                rate.append(rate_prev)
                if (i + 1) % save_every == 0 and i != 0:
                    w_est.append(w_prev)
                    w0_est.append(w0_prev)
                    beta_est.append(beta_prev)
                    log_post_est.append(log_post_prev)
                    log_post_param_est.append(log_post_param_prev)
                if i % 100 == 0 and i != 0:
                    # if i < nadapt:
                    if i >= step:
                        # epsilon = np.exp(np.log(epsilon) + 0.01 * (np.mean(rate) - 0.6))
                        epsilon = np.exp(
                            np.log(epsilon) + 0.01 *
                            (np.mean(rate[i - step:i]) - 0.6))
                if i % 1000 == 0:
                    print('update w and beta iteration ', i)
                    print('acceptance rate HMC = ',
                          round(accept_hmc / (i + 1) * 100, 1), '%')
                    print('epsilon = ', epsilon)

        # update n
        step_n = 25
        if n is True and (i + 1) % step_n == 0:
            n_prev, rubbish = up.update_n(w_prev, G, size, p_ij_prev, ind,
                                          selfedge)
            sum_n = np.array(
                csr_matrix.sum(n_prev, axis=0) +
                np.transpose(csr_matrix.sum(n_prev, axis=1)))[0]
            log_post_prev = aux.log_post_logwbeta_params(
                prior,
                sigma_prev,
                c_prev,
                t_prev,
                tau_prev,
                w_prev,
                w0_prev,
                beta_prev,
                n_prev,
                u_prev,
                p_ij_prev,
                a_t,
                b_t,
                gamma,
                sum_n,
                adj,
                x_prev,
                log_post_par=log_post_param_prev)
            if (i + 1) % save_every == 0 and i != 0:
                n_est.append(n_prev)
                log_post_param_est.append(log_post_param_prev)
                log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update n iteration ', i)

        # update u
        if u is True:
            u_prev = up.posterior_u(z_prev * w0_prev)
            log_post_param_prev = aux.log_post_params(prior, sigma_prev,
                                                      c_prev, t_prev, tau_prev,
                                                      w0_prev, beta_prev,
                                                      u_prev, a_t, b_t)
            log_post_prev = aux.log_post_logwbeta_params(
                prior,
                sigma_prev,
                c_prev,
                t_prev,
                tau_prev,
                w_prev,
                w0_prev,
                beta_prev,
                n_prev,
                u_prev,
                p_ij_prev,
                a_t,
                b_t,
                gamma,
                sum_n,
                adj,
                x_prev,
                log_post_par=log_post_param_prev)
            if (i + 1) % save_every == 0 and i != 0:
                u_est.append(u_prev)
                log_post_param_est.append(log_post_param_prev)
                log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update u iteration ', i)

        step_x = 1
        if x is True and (i + 1) % step_x == 0:
            x_prev, p_ij_prev, accept_distance_prev, log_post_prev = \
                up.update_x(x_prev, w_prev, gamma, p_ij_prev, n_prev, sigma_x, accept_distance[-1], prior,
                            sigma_prev, c_prev, t_prev, tau_prev, w0_prev, beta_prev, u_prev, a_t, b_t, sum_n, adj,
                            log_post_prev, log_post_param_prev, index, size_x, type_prop_x)
            accept_distance.append(accept_distance_prev)
            if (i + 1) % save_every == 0 and i != 0:
                p_ij_est.append(p_ij_prev)
                x_est.append(x_prev)
                log_post_param_est.append(log_post_param_prev)
                log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update x iteration ', i)
                print('acceptance rate x = ',
                      round(accept_distance[-1] * 100 * step_x / (i + 1), 1),
                      '%')
                print('sigma_x = ', sigma_x)
            if (i % (step / step_x)) == 0 and i != 0 and i < nburn:
                sigma_x = aux.tune(accept_distance, sigma_x,
                                   int(step / step_x))

    if gamma != 0:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est, u_est, \
                log_post_param_est, log_post_est, p_ij_est, x_est
    else:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est, u_est, \
                log_post_param_est, log_post_est, p_ij_est
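
A hypothetical invocation of the sampler above, assuming G was built with GraphSampler and that only the weights, counts, latent variables u and locations are to be resampled; every numeric setting here is an illustrative guess, not taken from the original code:

out = mcmc(G,
           iter=5000,
           nburn=2000,
           w0=True,         # resample weights (HMC by default)
           n=True,          # resample latent counts every 25 iterations
           u=True,          # resample truncated-Poisson latents
           x=True,          # resample locations (only meaningful when gamma != 0)
           save_every=100)
w_est = out[0]              # each returned element is a list of saved states
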
Exemplo n.º 28
0
class TopicSpecificRank:
	"""Similar to PageRank but the teleport set is a subset(related topics)
	of all nodes.

	...
	
	Parameters
	----------
	
	beta : float
		Probability with which teleports will occur
	
	edges : collections.defaultdict(list)
		Adjacency list containing the connection information of the web-graph
	
	epsilon : float
		A small value and total error in ranks should be less than epsilon
	
	max_iterations : int
		Maximum number of times to apply power iteration
	
	node_num : int
		Number of nodes in the web-graph
	
	PageRank_vector : numpy.ndarray  [1-dimensional, dtype=float]
		Contains PageRank of each node in the web-graph

	
	order : {'beta', 'edges', 'epsilon', 'max_iterations', 'node_num', 
	'PageRank_vector'}
		Parameters follow precisely the above order.
		None of the parameters is optional.

	
	Methods
	-------
	get_similarTopicPages()
		Classifies topic pages into different classes.

	matrix_get_initailRankMatrix()
		Initialises the topicSpecificRank Matrix.

	matrix_get_topicSpecificGoogleMatrix()
		Creates the Google Matrix which is used in power iteration.

	matrix_get_topicSpecificRank()
		Applies power iteration on Google Matrix and Initial Rank Matrix 
		to get TopicSpecificRank Matrix.

	list_get_topicSpecificRank()
		Alternative method for power iteration which uses much less RAM.

	topicSpecificRank()
		Utility function which calls the other functions and returns the rank vector.

	"""
	def __init__(self, beta, edges, epsilon, max_iterations, node_num,
		PageRank_vector):
		self.beta = beta
		self.edges = edges
		self.epsilon = epsilon
		self.node_num = node_num
		self.PageRank_vector = PageRank_vector
		self.MAX_ITERATIONS = max_iterations


	def get_similarTopicPages(self):
		"""Classifies topics pages in different classes.

		[INCOMPLETE] Write your own implementation to classify pages into
		topics.
		
		...
		
		Parameters
		----------
		None
		[May add more if required.]

		
		Returns
		-------
		lol_of_topic_pages : list of list of int
			Each inner list contains the related pages.
			Each page belongs to only one inner list.
			Outer list contains all such inner lists.
		"""
		pass


	def matrix_get_initailRankMatrix(self):
		"""Initailises the topicSpecificRank Matrix.

		
		Parameters
		----------
		None

		
		Returns
		-------
		initial_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), 
			n is `node_num`]
			Ranks are distributed equally among all pages, initially.

		"""
		initial_rank_list = [1/(self.node_num) for i in range(self.node_num)]
		initial_rank_vector = SparseMatrix(np.matrix(initial_rank_list).
			transpose())
		return initial_rank_vector

	
	def matrix_get_topicSpecificGoogleMatrix(self, related_pages):
		"""Creates the Google Matrix which is used in power iteration.

		
		Parameters
		----------
		related_pages : list of int
			Contains list of pages which belong to the same topic.

		
		Returns
		-------
		google_matrix : scipy.sparse.csr_matrix [shape = (n x n), n is 
						`node_num`]
			It contains the proportion of rank that will propagate from
			one page to another.
			The proportion depends on the degree of the node and the leaked rank.
		"""
		related_set_size = len(related_pages)

		teleport_matrix_row = []
		teleport_matrix_col = []
		teleport_matrix_data = []

		for related_node in related_pages:
			for node in range(self.node_num):
				teleport_matrix_col.append(node)
				teleport_matrix_row.append(related_node)
				teleport_matrix_data.append(
					(1 - self.beta) / related_set_size)
		
		teleport_matrix = SparseMatrix((teleport_matrix_data, (
			teleport_matrix_row, teleport_matrix_col)), shape = (self.node_num,
				self.node_num))

		connection_matrix_row = []
		connection_matrix_col = []
		connection_matrix_data = []

		for parent_node in range(self.node_num):
			for child_node in self.edges[parent_node]:
				connection_matrix_col.append(parent_node)
				connection_matrix_row.append(child_node)
				connection_matrix_data.append(
					self.beta / (len(self.edges[parent_node])))
		
		connection_matrix = SparseMatrix(
			(connection_matrix_data,
			(connection_matrix_row, connection_matrix_col)),
			shape=(self.node_num, self.node_num))

		google_matrix = connection_matrix + teleport_matrix
		return google_matrix	


	def matrix_get_topicSpecificRank(self, teleport_set, initial_rank_vector, 
		google_matrix):
		"""Calculates TopicSpecificRank of each node taking some related_pages 
		as `teleport_set`.

		This method works by applying power iteration until convergence
		or till iterations reach `MAX_ITERATIONS`, whichever happens first.
		
		[USAGE WARNING] : If the graph is large, the sparse matrix may become
			huge and use up the entire RAM (which is not a condition you want to be in).
		...

		Parameters
		----------
		teleport_set : list of int
			List of pages to which a random walker in the web-graph can 
			teleport to.
			In TopicSpecificRank this set corresponds to pages of same topic.

		initial_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), 
			n is `node_num`]
			Ranks are distributed equally among all pages, initially.

		google_matrix : scipy.sparse.csr_matrix [shape = (n x n), n is 
						`node_num`]
			It contains the proportion of rank that will propagate from
			one page to another.


		Returns
		-------
		final_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), 
			n is `node_num`]
			Contains TopicSpecificRank of each node in the web-graph.

		"""
		iterations = 0
		diff = math.inf
		teleport_set_size = len(teleport_set)
		final_rank_vector = SparseMatrix(np.zeros((self.node_num, 1)))  # all-zero column vector

		while(iterations < self.MAX_ITERATIONS and diff > self.epsilon):
			new_rank_vector = google_matrix * initial_rank_vector

			leaked_rank = ((1 - SparseMatrix.sum(new_rank_vector)) /
				teleport_set_size)
			leaked_rank_vector = SparseMatrix(np.array(
				[leaked_rank if node in teleport_set else 0
				for node in range(self.node_num)])).transpose()
			
			final_rank_vector = new_rank_vector + leaked_rank_vector
			diff = SparseMatrix.sum(
				abs(final_rank_vector - initial_rank_vector))
			
			initial_rank_vector = final_rank_vector
			iterations += 1
			print("At iteration: " + str(iterations))
Exemplo n.º 29
0
def tfpredic(testWord):
    # database 1 : questions that should be answered
    text_list = [
        '''ฝุ่นเยอะ มหาลัยยังเปิดเรียนใช่ไหม''', '''กำหนดการรับนักศึกษา''',
        '''ปีนี้คณะรับนักศึกษากี่คน''', '''สอบเข้า''',
        '''น้ำท่วม เปิดเรียนตามปกติ''', '''ย้ายคณะ''', '''น้ำท่วมมหาลัยหยุด''',
        '''โอนหน่วยกิต''', '''ประชุมอาเซียน มหาลัยปิดเรียน''',
        '''ลาพักการเรียน''', '''เทียบโอนหน่วยกิต''', '''เกียรตินิยม''',
        '''เปลี่ยนชื่อ''', '''ถามเรื่องกู้กยศ''', '''โควิดระบาด มหาลัยหยุด''',
        '''ไวรัสระบาด มหาลัยปิดเรียน''', '''รับนักศึกษากี่คนปีนี้''',
        '''ปีนี้รับนักศึกษาเยอะ''', '''ปีนี้รับ นศ เยอะ''',
        '''วันรับปริญญามีเรียนหรือเปล่า''',
        '''งานรับปริญญา มหาลัยปิดเรียนกี่วัน''',
        '''มหาลัยหยุดวันรับปริญญาถึงวันไหน''',
        '''สำนักทะเบียนวันนี้เปิดทำการตามปกติ''',
        '''สำนักทะเบียนพรุ่งนี้เปิดทำการตามปกติ''',
        '''สหกิจเทอมนี้ไปทำไม่ได้ ทำยังไง''', '''ฝึกงานได้ช่วงไหน''',
        '''หนังสือรับรองจบได้เมื่อไหร่''', '''สหกิจศึกษาเลื่อน''',
        '''สหกิจยังทำได้''', '''วันที่ สำนักทะเบียนทำการปกติ''', '''กยศ''',
        '''กู้กยศ''', '''ก.ย.ศ.''', '''กยศ.''', '''สอบ''', '''สอบโทอิค''',
        '''จ่ายเงิน''', '''ไปฝึกงานไม่ได้เทอมนี้ ควรทำอย่างไร''',
        '''น้ำท่วม หยุดเรียน''', '''กำหนดการต่างๆของมหาลัยยังเหมือนเดิมใช่''',
        '''วันที่ ปิดปรับปรุงเว็บ reg ''',
        '''ปิดปรับปรุงเว็บสำนักทะเบียนเมื่อไหร่''', '''เลื่อนระยะเวลาสอบ''',
        '''เลื่อนปิดภาคเรียน''', '''ซัมเมอร์ยังลงทะเบียนวันเดิม''',
        '''ภาคเรียนที่ 3 ยังลงทะเบียนวันเดิม''',
        '''ภาคเรียนที่สามวันลงทะเบียนเหมือนเดิม''', '''เลื่อนวันปิดเทอม''',
        '''มหาลัยเรียนออนไลน์อย่างไม่มีกำหนด''', '''หน่วยกิต''', testWord
    ]
    # database 2 : questions that do not need to be answered
    text_list2 = [
        '''คณะวิทยาศาสต์อยู่ตรงไหน''', '''เซเว่นไปทางไหน''',
        '''ไอทีมีเซเว่นไหม''', '''ในมอมีเซเว่นที่ไหนบ้าง''',
        '''โรงพยาบาลลาดกระบังไปทางไหน''', '''เย็นนี้ทานข้าวไหน''',
        '''วันนี้อาจารย์มีประชุม''', '''ระบบช้ามาก''', '''ขอดูเกรดเทอมนี้''',
        '''ลงทะเบียนไปแล้วกี่หน่วยกิต''', '''เหลืออีกหน่วยกิตกว่าจะจบ''',
        '''เหลือวิชาเลือกต้องลงอีกกี่ตัว''', '''หอในไปทางไหน''',
        '''เมื่อไหร่โควิดจะหาย''', '''เกรดจะออกครบทุกวิชาเมื่อไหร่''',
        '''วิชา อาจารย์ส่งเกรดหรือยัง''', '''สำนักคอมไปทางไหน''',
        '''เมื่อไหร่ระบบคำนวณเกรดจะเสร็จ''', '''เว็บล่มบ่อย เป็นอะไรนักหนา''',
        '''ร้านถ่ายเอกสารอยู่ไหน''', '''คณะวิศวะไปทางไหน''',
        '''คณะคุรุไปทางไหน''', '''แล้วไปเที่ยวกัน''', '''ธนาคารอยู่ไหน''',
        '''เกรดจะออกเมื่อไหร่''', '''ขอดูตารางเรียนส่วนบุคคล''',
        '''ขอดูตารางสอบส่วนตัว''', '''สถาปัตยกรรมศาสตร์ไปทางไหน''',
        '''ตึกพระเทพไปทางไหน''', '''ตึกพระจอมเกล้าอยู่ตรงไหน''',
        '''ตึกกลางน้ำอยู่ตรงไหน''', '''gpax''', '''ผลการเรียน''',
        '''ห้องน้ำไปทางไหนคะ''', '''หิวข้าวอยากกินข้าวมากๆๆๆ''',
        '''คณะไอทีอยู่ตรงไหน''', '''คะแนนสอบออกเมื่อไหร่''',
        '''มีรายวิชาอะไรส่งเกรดแล้วบ้าง''', '''อาจารย์ ออกจากคณะหรือยัง''',
        '''บริหารอยู่ตรงไหน''', '''เทคโนเกษตรอยู่ตรงไหน''',
        '''ส่งงานอาจารย์ ที่ไหน''', '''มีช่องทางติดต่ออาจารย์''',
        '''ตึกศิลปศาสตร์อยู่ตรงไหน''', '''ห้องอธิการบดีอยู่ที่ไหน''',
        '''ห้องอาจารย์อยู่ที่ไหน''', '''ผลการเรียนเทอมล่าสุด''',
        '''กินข้าวไปเที่ยวอาบน้ำ''', '''กินข้าวกันไหมเย็นนี้อยากกินมาก''',
        '''ตึกโหลอยู่ไหน''', testWord
    ]

    tokens_list = [split_word(txt) for txt in text_list]  # tokenize database 1 (which already includes the test word)
    tokens_list_j = [','.join(tkn) for tkn in tokens_list]

    tvec = TfidfVectorizer(analyzer=lambda x: x.split(','), )
    t_feat = tvec.fit_transform(tokens_list_j)

    tokens_list2 = [split_word(txt)
                    for txt in text_list2]  # split each sentence into words
    tokens_list_j2 = [
        ','.join(tkn) for tkn in tokens_list2
    ]  # join the words of each sentence back together, separated by ','

    tvec2 = TfidfVectorizer(analyzer=lambda x: x.split(','), )
    t_feat2 = tvec2.fit_transform(tokens_list_j2)

    # tf-idf scores of the test word (last row) against each database
    score1 = csr_matrix.sum(t_feat[-1, :])
    score2 = csr_matrix.sum(t_feat2[-1, :])

    if score1 < score2:
        return 1  # database 1 : questions that should be answered
    else:
        return 2  # database 2 : questions that do not need to be answered
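
A hypothetical call to the router above; split_word and the surrounding imports are assumed to be defined earlier in the original module, and the message text is made up:

msg = 'น้ำท่วม มหาลัยหยุดเรียนไหม'   # "there is flooding - is the university closed?"
if tfpredic(msg) == 1:
    print('route to database 1 (answerable questions)')
else:
    print('route to database 2 (questions that are not answered)')
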
def char_counts(df):
    #This function creates columns for each character and counts the times it
    #appears during each study session/day

    print('Calculating time since character last read, etc...')

    #Need to create corpus of characters found in all text_read
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(decode_error='strict', analyzer='char')
    corpus = df.loc[:, 'text_read']
    dtm = vectorizer.fit_transform(corpus)

    import numpy as np
    from itertools import chain
    import datetime
    from scipy.sparse import csr_matrix

    n = df.shape[0]
    df.loc[:, 'percent_seen'] = 0.0
    df.loc[:, 'mean_days_since'] = 0.0
    df.loc[:, 'mean_term_freq'] = 0.0
    for i in range(1, n):  #cycle through all rows except the first row
        ##Get percent of characters not seen in text so far
        #Find non-zero values in the sparse matrix for the previous records (rows 0..i-1)
        prior_non_zero = dtm[:i, :].nonzero()
        #Get list of all characters that have been seen so far
        before_chars = np.unique(prior_non_zero[1])
        #Find non-zero characters in the current record, as column numbers
        current_chars = np.sort(dtm[i, :].nonzero()[1])
        #http://stackoverflow.com/questions/28901311/numpy-find-index-of-elements-in-one-array-that-occur-in-another-array
        matching_current_index = np.where(np.in1d(current_chars,
                                                  before_chars))[0]
        df.loc[i, 'percent_seen'] = float(
            matching_current_index.shape[0]) / float(current_chars.shape[0])

        ##Get mean days since characters last read (for those already seen in text)
        #http://stackoverflow.com/questions/10252766/python-numpy-get-array-locations-of-a-list-of-values
        #http://stackoverflow.com/questions/11860476/how-to-unnest-a-nested-list

        #gets list of tuple arrays (1 array per char in matching chars) where each array gives the indices of
        #prior_non_zero where that character can be found
        matching_chars = current_chars[matching_current_index]
        prior_array_indices = [
            np.where(prior_non_zero[1] == k) for k in list(matching_chars)
        ]
        prior_array_indices = list(chain(*prior_array_indices))
        last_date_indices = [max(idx) for idx in prior_array_indices]
        last_date_rows = prior_non_zero[0][last_date_indices]
        current_date = df.loc[i, 'date']
        days_since_seen = [current_date - d
                           for d in df.loc[last_date_rows, 'date']]
        df.loc[i, 'mean_days_since'] = (
            sum(days_since_seen, datetime.timedelta(0)).total_seconds() /
            86400.0 / len(days_since_seen))

        ##Get mean frequency of document terms in the corpus so far
        #    NOT including the text read during the study session
        denominator = float(csr_matrix.sum(dtm[:i, :]))
        numerator = csr_matrix.sum(dtm[:i, matching_chars])  # prior occurrences of the characters seen again today
        df.loc[i, 'mean_term_freq'] = numerator / denominator

    #Normalize the current features
    norm_feat_list = ['cum_time', 'cum_char', 'mean_days_since']
    df = normalize_features(df, norm_feat_list)

    #Create interaction terms with cumulative time and character count features
    df.loc[:, 'timeXper_seen'] = df.loc[:, 'norm_cum_time'] * df.loc[:, 'percent_seen']
    df.loc[:, 'timeXdays_since'] = df.loc[:, 'norm_cum_time'] * df.loc[:, 'norm_mean_days_since']
    df.loc[:, 'timeXterm_freq'] = df.loc[:, 'norm_cum_time'] * df.loc[:, 'mean_term_freq']

    return df
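
The percent_seen and mean_term_freq features above both reduce to sums over slices of the document-term matrix. A toy illustration of the two quantities on a made-up 3-session matrix (row 2 plays the current session):

import numpy as np
from scipy.sparse import csr_matrix

dtm = csr_matrix(np.array([[2, 1, 0],     # session 0
                           [0, 3, 1],     # session 1
                           [1, 0, 4]]))   # session 2 (current)
i = 2
current_chars = np.sort(dtm[i, :].nonzero()[1])      # characters read in the current session
before_chars = np.unique(dtm[:i, :].nonzero()[1])    # characters seen in earlier sessions
seen_again = current_chars[np.in1d(current_chars, before_chars)]
percent_seen = float(len(seen_again)) / float(len(current_chars))      # 1.0 here
mean_term_freq = (csr_matrix.sum(dtm[:i, seen_again]) /
                  float(csr_matrix.sum(dtm[:i, :])))                   # 3 / 7
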