def Train(self, X, Y):
    """Estimate the Naive Bayes count statistics from labelled rows of X.

    Rows whose label in Y equals 1.0 are treated as positive and rows whose
    label equals -1.0 as negative.  The method stores on ``self``:

    * ``num_positive_reviews`` / ``num_negative_reviews`` - rows per class
    * ``total_positive_words`` / ``total_negative_words`` - sum of all
      entries of the class rows
    * ``deno_pos`` / ``deno_neg`` - class total plus ``ALPHA * X.shape[1]``
    * ``count_positive`` / ``count_negative`` - per-column sums of the class
      rows with ``ALPHA`` added to every column
    """
    alpha = self.ALPHA
    n_columns = X.shape[1]

    pos_idx = np.argwhere(Y == 1.0).flatten()
    neg_idx = np.argwhere(Y == -1.0).flatten()
    self.num_positive_reviews = len(pos_idx)
    self.num_negative_reviews = len(neg_idx)

    # Select each class's rows once and reuse the selection below.
    pos_rows = X[np.ix_(pos_idx)]
    neg_rows = X[np.ix_(neg_idx)]

    # Totals and smoothed denominators.
    self.total_positive_words = csr_matrix.sum(pos_rows)
    self.total_negative_words = csr_matrix.sum(neg_rows)
    self.deno_pos = float(self.total_positive_words + alpha * n_columns)
    self.deno_neg = float(self.total_negative_words + alpha * n_columns)

    # Smoothed per-column counts.
    self.count_positive = csr_matrix.sum(pos_rows, axis=0) + alpha
    self.count_negative = csr_matrix.sum(neg_rows, axis=0) + alpha
def Train(self, X, Y):
    """Estimate the Naive Bayes count statistics from labelled rows of X.

    Rows whose label in Y equals 1.0 are positive, rows labelled -1.0 are
    negative.  Stored on ``self``:

    * ``num_positive_reviews`` / ``num_negative_reviews`` - rows per class
    * ``count_positive`` / ``count_negative`` - per-column sums of the class
      rows with ``ALPHA`` added to every column
    * ``total_positive_words`` / ``total_negative_words`` - sum of the raw
      (unsmoothed) class counts
    * ``deno_pos`` / ``deno_neg`` - raw total plus ``ALPHA * X.shape[1]``,
      i.e. exactly the sum of the smoothed counts
    * ``pos_recall``, ``pos_precision``, ``neg_recall``, ``neg_precision`` -
      reset to empty lists

    The totals used to be summed from the already-smoothed counts, so the
    smoothing mass ``ALPHA * X.shape[1]`` ended up in the denominators
    twice and ``count / deno`` no longer summed to one.
    """
    positive_indices = np.argwhere(Y == 1.0).flatten()
    negative_indices = np.argwhere(Y == -1.0).flatten()

    # Number of positive / negative rows.
    self.num_positive_reviews = len(positive_indices)
    self.num_negative_reviews = len(negative_indices)

    # Raw per-column counts for each class.
    raw_positive = csr_matrix.sum(X[np.ix_(positive_indices)], axis=0)
    raw_negative = csr_matrix.sum(X[np.ix_(negative_indices)], axis=0)

    # Smoothed per-column counts.
    self.count_positive = raw_positive + self.ALPHA
    self.count_negative = raw_negative + self.ALPHA

    # Totals are taken from the raw counts so smoothing is applied only once.
    self.total_positive_words = np.sum(raw_positive)
    self.total_negative_words = np.sum(raw_negative)

    # Denominators: raw total plus one ALPHA per column.
    smoothing_mass = self.ALPHA * X.shape[1]
    self.deno_pos = self.total_positive_words + smoothing_mass
    self.deno_neg = self.total_negative_words + smoothing_mass

    self.pos_recall = []
    self.pos_precision = []
    self.neg_recall = []
    self.neg_precision = []
def Train(self, X, Y):
    """Accumulate Naive Bayes count statistics from labelled rows of X.

    Rows labelled 1.0 in Y are positive, rows labelled -1.0 are negative.
    ``self.count_positive`` / ``self.count_negative`` must already exist:
    the smoothed per-column class counts are added on top of their current
    values.  Also stores the row selectors (``count_P`` / ``count_N``), the
    per-class row counts, the raw class totals and the smoothed
    denominators ``total + ALPHA * X.shape[1]``.
    """
    pos_idx = np.argwhere(Y == 1.0).flatten()
    neg_idx = np.argwhere(Y == -1.0).flatten()
    self.num_positive_reviews = len(pos_idx)
    self.num_negative_reviews = len(neg_idx)

    # Row selectors are kept on the instance, as before.
    self.count_P = np.ix_(pos_idx)
    self.count_N = np.ix_(neg_idx)
    pos_rows = X[self.count_P]
    neg_rows = X[self.count_N]

    # Add the smoothed column sums to whatever the counters already hold.
    self.count_positive = (
        self.count_positive + csr_matrix.sum(pos_rows, axis=0) + self.ALPHA)
    self.count_negative = (
        self.count_negative + csr_matrix.sum(neg_rows, axis=0) + self.ALPHA)

    self.total_positive_words = csr_matrix.sum(pos_rows)
    self.total_negative_words = csr_matrix.sum(neg_rows)

    smoothing_mass = self.ALPHA * X.shape[1]
    self.deno_pos = float(self.total_positive_words + smoothing_mass)
    self.deno_neg = float(self.total_negative_words + smoothing_mass)
def build_grad(self, v):
    """Return the gradient of the regularised negative log-likelihood at v.

    The result is ``-(empirical_counts - expected_counts - lambda_reg * v)``
    where

    * ``empirical_counts`` are the column sums of ``self.training_matrix``;
    * ``expected_counts`` accumulate, for every entry of
      ``self.all_ones_positions_in_the_feature_vector``, the transposed
      candidate matrix applied to the softmax of ``mat.dot(v)``.

    :param v: current weight vector
    :return: gradient vector
    :raises Exception: any failure is logged with its traceback and
        re-raised.  The previous implementation printed the error and
        returned ``None``, which only moved the crash into the caller.
    """
    logger.info("build_grad --------------> ")
    try:
        # Empirical counts, convert matrix to vector
        empirical_counts = np.squeeze(
            np.asarray(csr_matrix.sum(self.training_matrix, axis=0)))

        # Expected counts
        expected_counts = np.zeros(self.feature_vector_len)
        candidates = self.all_ones_positions_in_the_feature_vector
        for key in candidates:
            mat = candidates[key]
            # All probabilities for this key are computed at once.
            nominators = np.exp(mat.dot(v))
            prob = nominators / np.sum(nominators)
            expected_counts += mat.transpose().dot(prob)
    except Exception:
        logger.exception("build_grad failed")
        raise

    logger.info("build_grad <-------------- ")
    return -(empirical_counts - expected_counts - self.lambda_reg * v)
def __init__(self, weight_graph, labels):
    """Store the weight graph, the labels and the node degrees.

    :param weight_graph: 2-D array of non-negative edge weights
    :param labels: either a 1-D label array (the number of classes is taken
        as ``max(labels) + 1``; the original comment says unlabeled points
        should be -1 and classes start at 0) or an array with more than one
        dimension whose second axis length is the number of classes
    :raises ValueError: if any weight is negative
    """
    if np.amin(weight_graph) < 0:
        raise ValueError('Negative weights inputed')
    self.weight_matrix = csr_matrix(weight_graph)

    label_rank = len(labels.shape)
    if label_rank == 1:
        self.class_labels = labels
        self.number_of_classes = np.amax(labels) + 1
    elif label_rank > 1:
        self.class_labels = labels
        self.number_of_classes = labels.shape[1]

    self.node_number = self.weight_matrix.shape[0]
    self.y_matrix_creation()

    # Row sums of the weight matrix, flattened to a 1-D array.
    self.degrees = np.asarray(
        csr_matrix.sum(self.weight_matrix, axis=1)).reshape(-1)

    # Nudge zero degrees by the smallest positive float32.
    # NOTE(review): presumably this guards a later division by the degree -
    # confirm against the callers.
    tiny = np.nextafter(np.float32(0), np.float32(1))
    isolated = self.degrees == 0
    self.degrees[isolated] += tiny
def training(num_words, X_train, y_train, alpha, num_classes=20):
    """Train a multinomial naive Bayes model.

    Computes the log class frequencies and, for every class, the log of the
    smoothed per-column relative frequencies.

    :param num_words: unused; kept for backward compatibility
    :param X_train: sparse matrix, one row per training case
    :param y_train: non-negative integer class label per row of X_train
    :param alpha: additive smoothing constant added to every column count
    :param num_classes: number of classes ``0 .. num_classes - 1`` to fit
        (previously hard-coded to 20, which remains the default)
    :return: ``(prob_freq, prob_mat)`` - ``prob_freq`` holds the log relative
        frequency of each label value found by ``np.bincount``; ``prob_mat``
        has shape ``(X_train.shape[1], num_classes)`` and column ``k`` holds
        the log conditional probabilities for class ``k``
    """
    # initialize probability matrix
    prob_mat = np.zeros([X_train.shape[1], num_classes])

    # calculate the relative frequencies
    freq_bins = np.asarray(np.bincount(y_train), dtype=float)
    prob_freq = np.log(freq_bins / np.sum(freq_bins))

    for k in range(num_classes):
        # rows of the training matrix belonging to class k
        class_rows = X_train[np.where(y_train == k)[0]]
        # smoothed column counts (vectorized over all columns)
        numerator_k = np.array(
            csr_matrix.sum(class_rows, axis=0) + alpha, dtype=float)
        # the associated log conditional probabilities, as a column
        log_probs_k = np.log(
            np.transpose(np.array(numerator_k / np.sum(numerator_k))))
        prob_mat[:, [k]] = log_probs_k

    return prob_freq, prob_mat
def eq1(Wcm, Wuu, Wl, H, T, ak, wf):
    """Solve the regularised linear system ``A * alpha = b`` with CG.

    With ``D*`` the diagonal matrix of row sums of the matching ``W*``::

        Lifm = (Dcm - Wcm)^T (Dcm - Wcm) + suu * (Duu - Wuu) + sl * (Dl - Wl)
        A    = Lifm + lamd * T + sku * H
        b    = (lamd * T + sku * H) . wf

    The system is solved with conjugate gradients started from ``wf``
    (at most 100 iterations) and the solution is returned scaled by 255.

    :param Wcm, Wuu, Wl: sparse weight matrices
    :param H, T: sparse matrices weighted by ``sku`` and ``lamd``
    :param ak: unused; kept for backward compatibility
    :param wf: vector used both in the right-hand side and as initial guess
    :return: CG solution multiplied by 255
    """
    sku = 0.05
    suu = 0.01
    sl = 1
    lamd = 100

    def degree_matrix(W):
        # Diagonal matrix holding the row sums of W.
        row_sums = csr.sum(W, axis=1)  # numpy matrix
        return diags(row_sums.A.ravel()).tocsr()

    Dcm = degree_matrix(Wcm)
    Duu = degree_matrix(Wuu)
    Dl = degree_matrix(Wl)

    Lifm = (csr.transpose(Dcm - Wcm).dot(Dcm - Wcm)
            + suu * (Duu - Wuu) + sl * (Dl - Wl))
    A = Lifm + lamd * T + sku * H
    b = (lamd * T + sku * H).dot(wf)

    # No preconditioner is used.  An unused diagonal preconditioner used to
    # be built here on every call while M=None was passed to the solver.
    # NOTE(review): recent SciPy releases rename ``tol`` to ``rtol`` -
    # confirm the targeted SciPy version before upgrading.
    alpha = cg(A, b, x0=wf, tol=1e-05, maxiter=100, M=None, callback=None,
               atol=None)
    return alpha[0] * 255
def apply_redundancy_penalty(self, selected_sentence, sentences):
    """
    Apply a redundancy penalty to all sentences based on the given selected
    sentence.

    For every sentence the penalty is ``overlap / counts`` where ``overlap``
    is the number of vector positions that are non-zero in both the selected
    sentence and the sentence, and ``counts`` is the sum of both vectors.
    The penalty is subtracted from ``sentence.mead_score`` in place.

    A sentence is left untouched when ``counts`` is zero (both vectors
    empty); this used to produce a NaN score or a division error.  The
    division is always a true division, also under Python 2.

    :param selected_sentence: the selected sentence
    :param sentences: iterable of sentences to penalise
    :return: void
    """
    selected_vector = selected_sentence.vector
    # Both depend only on the selected sentence: compute them once.
    selected_mask = selected_vector != 0
    selected_total = selected_vector.sum()

    for sentence in sentences:
        counts = selected_total + sentence.vector.sum()
        if counts == 0:
            continue
        overlap = csr_matrix.sum(selected_mask.multiply(sentence.vector != 0))
        sentence.mead_score = sentence.mead_score - float(overlap) / counts
def generarte_autocomplete_vocab():
    """Build the autocomplete bigram vocabulary and pickle it.

    Reads ``vocab_to_ix.json``, ``ix_to_vocab.json`` and ``tfidf_mat.npz``
    from the working directory.  The (up to) 1000 words with the largest
    column sums of the tf-idf matrix are selected, purely numeric words are
    dropped, and each remaining word is paired with the two words ranked
    second and third in its row of ``tfidf_mat.T * tfidf_mat`` (the first
    rank is assumed to be the word itself).  The UTF-8 encoded bigrams
    ``"word other"`` are written to ``autocomplete_bigram_vocab.pickle``.

    Changes against the previous version: every input is loaded once, the
    ``.npz`` file is loaded by path instead of through a text-mode handle,
    the vocabulary size comes from the matrix instead of the constant
    116754, vocabularies with fewer than 1000 words are handled, and words
    are encoded once so non-ASCII words no longer fail.
    """
    top_word_count = 1000

    with open("vocab_to_ix.json", 'r') as f:
        vocab_to_ix = json.load(f)
    with open("ix_to_vocab.json", 'r') as f2:
        ix_to_vocab = json.load(f2)

    tfidf_mat = load_npz('tfidf_mat.npz')
    co_occurence_mat = (tfidf_mat.T) * tfidf_mat

    # Column indices ordered by ascending total tf-idf weight.
    sum_arr = csr_matrix.sum(tfidf_mat, axis=0)
    ascending = np.asarray(np.argsort(sum_arr)).ravel()
    words_arr = [ix_to_vocab[str(ix)] for ix in ascending]

    # Keep the heaviest words, then drop the ones made only of digits.
    word_refined = words_arr[-top_word_count:]
    no_ints = [word for word in word_refined
               if not word.encode("utf8").isdigit()]

    bigrams = []
    for word in no_ints:
        ix = vocab_to_ix[word]
        sorted_row = np.argsort(co_occurence_mat[ix, :].toarray()[0])[::-1]
        # First index is the same word, so take the second and third word
        for neighbour_ix in (sorted_row[1], sorted_row[2]):
            neighbour = ix_to_vocab[str(neighbour_ix)]
            bigrams.append((word + u" " + neighbour).encode("utf8"))

    with open("autocomplete_bigram_vocab.pickle", "wb") as outfile:
        pickle.dump(bigrams, outfile)
def Train(self, X, Y):
    """Collect per-class count statistics, then run one perceptron-style pass.

    First part: rows of X labelled 1.0 / -1.0 in Y are summed per column
    (plus ``self.ALPHA``) into ``count_pos`` / ``count_neg``; the raw class
    totals go to ``total_pos`` / ``total_neg`` and the smoothed denominators
    ``total + ALPHA * X.shape[1]`` to ``deno_pos`` / ``deno_neg``.

    Second part: starting from a zero weight column, the first
    ``self.samples`` rows are visited once; whenever the sign of
    ``X[j] . w`` (0 when the product is exactly 0) differs from ``Y[j]``,
    ``Y[j] * X[j]^T`` is added to the weights and ``Y[j]`` is added to
    ``self.for_avg_weight``.  The transposed weights are stored in
    ``self.weight``.
    """
    pos_indices = np.argwhere(Y == 1.0).flatten()
    neg_indices = np.argwhere(Y == -1.0).flatten()
    self.pos_rev = len(pos_indices)
    self.neg_rev = len(neg_indices)
    # Smoothed per-column counts for each class.
    self.count_pos = csr_matrix.sum(X[np.ix_(pos_indices)],
                                    axis=0) + self.ALPHA
    self.count_neg = csr_matrix.sum(X[np.ix_(neg_indices)],
                                    axis=0) + self.ALPHA
    # Raw totals over all entries of the class rows.
    self.total_pos = csr_matrix.sum(X[np.ix_(pos_indices)])
    self.total_neg = csr_matrix.sum(X[np.ix_(neg_indices)])
    self.deno_pos = float(self.total_pos + self.ALPHA * X.shape[1])
    self.deno_neg = float(self.total_neg + self.ALPHA * X.shape[1])
    samples = self.samples
    valid = 0
    weight_trans = np.zeros([X.shape[1], 1])
    # NOTE(review): `converged` is set to 1 only here and never reset, so
    # the `break` below can fire only while no update has happened yet.
    # It looks like a per-epoch convergence check was intended - confirm.
    converged = 1
    for j in range(samples):
        # Score of row j under the current weights.
        term = (X[j].dot(weight_trans))
        # Predicted label: sign of the score, 0 when the score is exactly 0.
        valid = 0
        if (term > 0.0):
            valid = 1.0
        elif term < 0.0:
            valid = -1.0
        if Y[j] != valid:
            # Misclassified (or zero score): move the weights towards Y[j].
            weight_trans += (Y[j] * X[j].transpose())
            # NOTE(review): only the label is accumulated here, not a
            # weight vector - verify against the code that reads
            # for_avg_weight.
            self.for_avg_weight += Y[j]
            converged = 0
        if converged == 1:
            break
    self.weight = weight_trans.transpose()
    return
def matrix_get_topicSpecificRank(self, teleport_set, initial_rank_vector, google_matrix):
    """Calculates TopicSpecificRank of each node taking some related_pages
    as `teleport_set`.

    This method works by applying power iteration until convergence or till
    iterations reach `MAX_ITERATIONS`, whichever happens first.

    [USAGE WARNING] : If graph is large, then sparse matrix may become huge
    and use up the entire RAM (which is not a condition to be in).

    Parameters
    ----------
    teleport_set : list of int
        List of pages to which a random walker in the web-graph can teleport
        to. In TopicSpecificRank this set corresponds to pages of same topic.
    initial_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), n is `node_num`]
        Ranks are distributed equally among all pages, initially.
    google_matrix : scipy.sparse.csr_matrix [shape = (n x n), n is `node_num`]
        It contains proportion of rank that will propagate from a page to
        another page.

    Returns
    -------
    final_rank_vector : scipy.sparse.csr_matrix [shape = (n x 1), n is `node_num`]
        Contains TopicSpecificRank of each node in the web-graph.
    """
    iterations = 0
    diff = math.inf
    teleport_set_size = len(teleport_set)
    final_rank_vector = SparseMatrix(np.zeros(self.node_num).transpose())
    # NOTE(review): the part of the loop visible here never updates
    # `iterations`, `diff` or `final_rank_vector` and there is no return
    # statement - the remainder of the body is presumably outside this view.
    while(iterations < self.MAX_ITERATIONS and diff > self.epsilon):
        # One propagation step of the current ranks through the matrix.
        new_rank_vector = google_matrix * initial_rank_vector
        # Rank mass missing from a total of 1, split evenly over the
        # teleport set.
        leaked_rank = (1-SparseMatrix.sum(new_rank_vector))/ teleport_set_size
        # Column vector holding `leaked_rank` for teleport-set nodes and 0
        # for every other node.
        leaked_rank_vector = SparseMatrix(np.array([leaked_rank if node in teleport_set else 0 for node in range(self.node_num)])). transpose()
def squaredis(P,Cent):
    """Score every row of P against every centroid and pick the minima.

    An augmented centroid matrix ``C`` with ``d + 2`` columns is built:
    column 0 holds the squared norm of each centroid, column 1 is all ones
    and the remaining ``d`` columns are the centroid itself.  The score
    matrix is ``D = P . C^T``.

    NOTE(review): for D to be a squared distance, P presumably holds
    ``[1, |p|^2, -2p]`` per point - confirm against the caller.

    :return: ``(dists, Tags, y)`` - the row-wise minima of D (or D flattened
        when one of its dimensions is 1), the index of the minimising
        centroid per row, and the index of the minimising row per centroid
    """
    n_centroids, d = Cent.shape

    augmented = SM((n_centroids, d + 2))
    augmented[:, 0] = SM.sum(SM.power(Cent, 2), 1)
    augmented[:, 1] = 1
    augmented[:, 2:d + 2] = Cent

    scores = SM.dot(P, augmented.T).toarray()

    # Closest centroid for each point.
    Tags = scores.argmin(1)
    if min(scores.shape) > 1:
        dists = scores.min(1)
    else:
        dists = np.ravel(scores)
    y = scores.argmin(0)
    return dists, Tags, y
def _graph_node_vectorize(graph, decomposition_funcs, preprocessors=None,
                          nbits=16, effective_radius=1,
                          cutoff_effective_radius_factor=2,
                          max_num_node_features=1, weight=None,
                          type_of='shortest', attribute_label=None):
    """Collapse the node-level feature matrix of a graph into one sparse row.

    All arguments are forwarded unchanged, in order, to
    ``node_proximity_node_vectorize``; the matrix it returns is summed over
    its rows and the resulting single row is returned as a ``csr_matrix``.
    """
    node_features = node_proximity_node_vectorize(
        graph, decomposition_funcs, preprocessors, nbits, effective_radius,
        cutoff_effective_radius_factor, max_num_node_features, weight,
        type_of, attribute_label)
    column_totals = csr_matrix.sum(node_features, axis=0)
    return csr_matrix(column_totals)
def correlation_filter(p, all_vars, quantile_filter=0.25):
    """Calculates correlations between phenotype and variants, giving those
    that are above the specified quantile

    Variants whose correlation is undefined (all-zero rows, or rows /
    phenotypes with zero variance) get NaN.  They are ignored when the
    quantile threshold is computed and never pass the filter.  Previously a
    single such variant turned the percentile into NaN, so no variant
    passed at all.

    Args:
        p (pandas.DataFrame)
            Phenotype vector (n, 1)
        all_vars (scipy.sparse.csr_matrix)
            Narrow sparse matrix representation of all variants to fit to
            (rows = variants, columns = samples)
        quantile_filter (float)
            The quantile to discard at e.g. 0.25, retain top 75%

            [default = 0.25]
    Returns:
        cor_filter (numpy.array)
            The indices of variants passing the filter
    """
    # a = snp - mean(snp)
    # b = y - mean(y)
    # cor = abs(a%*%b / sqrt(sum(a^2)*sum(b^2)) )
    b = p.values - np.mean(p.values)
    sum_b_squared = np.sum(np.power(b, 2))
    n_variants, n_samples = all_vars.shape

    # One float per variant; NaN marks "no defined correlation".
    correlations = np.full(n_variants, np.nan)

    # NOTE: I couldn't get this to multithread efficiently using sparse
    # matrices... might work if the matrix was divided into chunks of rows
    # first, but maybe not worth it as it's pretty quick anyway
    for row_idx in tqdm(range(n_variants), unit="variants"):
        k = all_vars.getrow(row_idx)
        k_mean = csr_matrix.mean(k)
        if k_mean == 0:
            # avoid crashes due to an empty sparse vector
            continue
        ab = np.sum(k.dot(b)) - np.sum(k_mean * b)
        sum_a_squared = (k.dot(k.transpose()).data[0]
                         - 2 * k_mean * csr_matrix.sum(k)
                         + pow(k_mean, 2) * n_samples)
        denominator = sum_a_squared * sum_b_squared
        if denominator > 0:
            correlations[row_idx] = np.abs(ab / np.sqrt(denominator))

    # NaN compares False against any threshold, so undefined variants drop out.
    threshold = np.nanpercentile(correlations, quantile_filter * 100)
    cor_filter = np.nonzero(correlations > threshold)[0]
    return cor_filter
def train(self, X, y, word_vocab):
    """
    Train on the sparse document-term matrix X and associated labels y.

    In the test case below, p_wc is a class-term-matrix and has a row for
    each class and a column for each term. So the value at ij is the p_wc
    for the j-th term in the i-th class.

    p_c is an array of global probabilities for each class.

    Sets ``self.classes`` (sorted unique labels), ``self.p_c``,
    ``self.p_wc`` and ``self.log_p_wc``.  ``self.p_c`` is rebuilt on every
    call, so training twice no longer duplicates its entries, and nothing is
    printed any more (the debug output made the doctest below fail).

    >>> wv, cv = generate_vocab("example.txt")
    >>> X, y = read_labeled_data("example.txt", cv, wv)
    >>> nb = NaiveBayes(cv, wv)
    >>> nb.train(X, y, wv)
    >>> numpy.round(nb.p_wc, 3)
    array([[ 0.664, 0.336],
           [ 0.335, 0.665]])
    >>> numpy.round(nb.p_c, 3)
    array([ 0.5, 0.5])
    """
    self.classes = numpy.unique(y)
    n_docs = float(len(y))
    n_vocab = len(word_vocab)

    priors = []
    class_rows = []
    for label in self.classes:
        in_class = (y == label)
        priors.append(in_class.sum() / n_docs)

        # Term counts of the class, smoothed by self.e per vocabulary term.
        term_counts = csr_matrix.sum(X[in_class], axis=0)
        nC = term_counts.sum() + self.e * n_vocab
        # Out-of-place add: integer counts plus a float smoothing constant
        # cannot be combined in place.
        class_rows.append(numpy.divide(term_counts + self.e, nC))

    self.p_c = priors
    self.p_wc = numpy.concatenate(class_rows)
    self.log_p_wc = numpy.log(self.p_wc)
def fit(self, X, y):
    """Fit the model on the graph matrix X and the partial labelling y.

    X is treated as a square node-by-node weight matrix (dense or sparse);
    it is normalised as ``D^(-sigma) . X . D^(sigma - 1)`` with ``D`` the
    row sums.  Entries of y equal to 0 mark unlabeled nodes; a label
    ``c >= 1`` sets column ``c - 1`` of the one-hot label matrix.

    Fixes against the previous version:

    * the right-hand scaling called ``np.power`` without its base and
      raised ``TypeError`` whenever ``sigma != 0.5``;
    * the class count was ``len(np.nonzero(y))``, which is always 1 for a
      1-D ``y``; it is now the largest label (at least 1);
    * ``np.float`` (removed from NumPy) is replaced by ``float``;
    * the scaling is done with diagonal products / broadcasting instead of
      per-row and per-column slice updates.

    :param X: (n, n) array or scipy sparse matrix
    :param y: length-n integer labels, 0 for unlabeled
    :return: self
    """
    sparse_formats = ['csc', 'csr', 'coo', 'dok', 'bsr', 'lil', 'dia']
    check_X_y(X, y, accept_sparse=sparse_formats)
    check_array(X, accept_sparse=sparse_formats)
    self.X_ = X
    check_classification_targets(y)

    y = np.asarray(y)
    labeled = np.nonzero(y)[0]
    n_samples = len(y)
    # Labels are 1-based column ids, so the widest label fixes the width.
    n_classes = max(int(np.max(y)), 1)

    # degree of every node
    if sparse.isspmatrix(self.X_):
        D = np.array(csr_matrix.sum(self.X_, axis=1), dtype=float).T[0]
    else:
        D = np.array(np.sum(self.X_, axis=1), dtype=float)

    # NOTE(review): a node with degree 0 yields inf scaling factors here,
    # exactly as before - confirm whether isolated nodes can occur.
    D_left = np.power(D, -self.sigma)
    D_right = np.power(D, self.sigma - 1)

    # B_ = D_left . X . D_right
    if sparse.isspmatrix(self.X_):
        B_ = sparse.diags(D_left).dot(self.X_.astype(float)).dot(
            sparse.diags(D_right)).asformat(self.X_.format)
    else:
        B_ = np.copy(self.X_).astype(float)
        B_ *= D_left[:, np.newaxis]
        B_ *= D_right[np.newaxis, :]

    # create labeled data Z
    dimension = (n_samples, n_classes)
    ans_y = np.zeros(dimension)
    ans_y[labeled, y[labeled].astype(int) - 1] = 1
    Z_ = (self.sigma / (1 + self.sigma)) * ans_y

    self.initial_vector_ = np.ones(dimension) / n_classes
    self._get_method_(B_, Z_)
    return self
def spectral_cluster(G, node_list):
    """Pick a cluster count for G by watching the normalised-cut score.

    Spectral clustering is run on the sparse matrix of G for 2, 3, ...
    clusters.  For every clustering the score is the sum over clusters of
    ``cut / volume``, where ``cut`` adds the edge weights between cluster
    members and all other nodes and ``volume`` adds the matrix column sums
    of the members.  As soon as the score exceeds the previous (non-zero)
    score, or 99 clusters are reached, the PREVIOUS clustering is returned.

    :param G: graph whose edges carry a ``"weight"`` attribute (G is a
        similarity matrix)
    :param node_list: node ordering used for the sparse matrix
    :return: dict with ``"result_by_node"`` (node -> label) and
        ``"result_by_cluster"`` (label -> set of nodes)
    """
    S = nx.to_scipy_sparse_matrix(G, nodelist=node_list)

    # Built once: these depend only on node_list, not on the clustering.
    all_nodes = set(node_list)
    column_of = {}
    for position, node in enumerate(node_list):
        column_of.setdefault(node, position)  # first occurrence, like index()

    previous_sum_cut = 0
    previous_cluster_node = {}
    previous_cluster_label = {}
    for i in range(2, 100):
        labels = spectral_clustering(S, n_clusters=i).tolist()
        result_cluster_node = dict(zip(node_list, labels))

        result_cluster_label = {}
        for node, label in result_cluster_node.items():
            result_cluster_label.setdefault(label, set()).add(node)

        sum_cut = 0
        for members in result_cluster_label.values():
            outsiders = all_nodes.difference(members)
            cut_k = 0
            vol_k = 0
            for nk in members:
                vol_k += csr_matrix.sum(S.getcol(column_of[nk]))
                for notk in outsiders:
                    cut_k += G.get_edge_data(
                        nk, notk, default={"weight": 0})["weight"]
            sum_cut += (cut_k / vol_k)

        # Stop once the score starts rising (or at the last cluster count).
        if sum_cut > previous_sum_cut != 0 or i == 99:
            print(i, sum_cut, result_cluster_label)
            return {"result_by_node": previous_cluster_node,
                    "result_by_cluster": previous_cluster_label}

        previous_cluster_node = result_cluster_node
        previous_cluster_label = result_cluster_label
        previous_sum_cut = sum_cut
def gradient(self, w):
    """
    Gradient of the regularised negative log-likelihood at w.

    Empirical feature counts are the column sums of the model's training
    feature matrix; expected counts add up, over every matrix in
    ``possible_genres_per_node_matrix``, its transpose applied to the
    softmax of ``matrix.dot(w)``.

    :param w: weight vector
    :return: ``-(empirical - expected - lambda_ * w)``
    """
    model = self.model

    # column sums of the training features, flattened to a vector
    column_sums = csr_matrix.sum(model.train_feature_matrix, axis=0)
    empirical_counts = np.squeeze(np.asarray(column_sums))

    # expected counts under the current weights
    expected_counts = np.zeros(model.feature_vector_len)
    for node_matrix in model.possible_genres_per_node_matrix.values():
        nominators = np.exp(node_matrix.dot(w))
        prob = nominators / np.sum(nominators)
        expected_counts += node_matrix.transpose().dot(prob)

    penalised_difference = (
        empirical_counts - expected_counts - self.lambda_ * w)
    return -penalised_difference
def char_counts(df):
    """Add character-history features for every study session/day.

    A character-level document-term matrix is built from ``text_read``.
    For every row ``i`` after the first the function fills in:

    * ``percent_seen`` - share of the row's distinct characters that occur
      in any earlier row;
    * ``mean_days_since`` - mean number of days between the row's ``date``
      and the most recent earlier row containing each already-seen
      character;
    * ``mean_term_freq`` - share of all earlier character occurrences that
      belong to the already-seen characters of the row (the text read in
      the current session is not included).

    The features are then normalised with ``normalize_features`` and
    interaction columns with ``norm_cum_time`` are added.

    Fixes against the previous version: ``mean_term_freq`` selected columns
    by position inside the row's character list instead of by character
    column id; rows with no text, no already-seen characters or an empty
    history now keep 0.0 instead of dividing by zero; Python 2-only ``print``
    and ``map`` usage is gone.

    NOTE(review): rows are addressed with ``df.loc[i, ...]``, so the frame
    is assumed to carry the default 0..n-1 index - confirm with the caller.

    :param df: DataFrame with ``text_read``, ``date``, ``cum_time`` and
        ``cum_char`` columns
    :return: the DataFrame returned by ``normalize_features`` with the
        interaction columns added
    """
    print('Calculating time since character last read, etc...')

    import datetime

    import numpy as np
    from scipy.sparse import csr_matrix
    from sklearn.feature_extraction.text import CountVectorizer

    # Corpus of characters found in all text_read
    vectorizer = CountVectorizer(decode_error='strict', analyzer='char')
    dtm = vectorizer.fit_transform(df.loc[:, 'text_read'])

    n = df.shape[0]
    df.loc[:, 'percent_seen'] = 0.0
    df.loc[:, 'mean_days_since'] = 0.0
    df.loc[:, 'mean_term_freq'] = 0.0

    for i in range(1, n):  # cycle through all rows except first row
        history = dtm[:i, :]
        prior_rows, prior_cols = history.nonzero()
        before_chars = np.unique(prior_cols)  # characters seen so far
        current_chars = np.sort(dtm[i, :].nonzero()[1])

        # Positions (within current_chars) of characters seen before, and
        # the matching character column ids.
        matching_current_index = np.where(
            np.in1d(current_chars, before_chars))[0]
        matching_chars = current_chars[matching_current_index]

        # Percent of the row's characters already seen in earlier text
        if current_chars.shape[0]:
            df.loc[i, 'percent_seen'] = (
                float(matching_chars.shape[0]) / float(current_chars.shape[0]))

        # Mean days since each already-seen character was last read
        last_date_rows = [prior_rows[prior_cols == k].max()
                          for k in matching_chars]
        if last_date_rows:
            current_date = df.loc[i, 'date']
            days_since_seen = [current_date - seen_on
                               for seen_on in df.loc[last_date_rows, 'date']]
            total_seconds = sum(
                days_since_seen, datetime.timedelta(0)).total_seconds()
            df.loc[i, 'mean_days_since'] = (
                total_seconds / 86400.0 / len(days_since_seen))

        # Mean frequency of the row's characters in the corpus so far,
        # NOT including the text read during the study session
        denominator = float(csr_matrix.sum(history))
        if denominator:
            numerator = csr_matrix.sum(history[:, matching_chars])
            df.loc[i, 'mean_term_freq'] = numerator / denominator

    # Normalize the current features
    norm_feat_list = ['cum_time', 'cum_char', 'mean_days_since']
    df = normalize_features(df, norm_feat_list)

    # Interaction terms with the normalised cumulative time
    norm_cum_time = df.loc[:, 'norm_cum_time']
    df.loc[:, 'timeXper_seen'] = norm_cum_time * df.loc[:, 'percent_seen']
    df.loc[:, 'timeXdays_since'] = (
        norm_cum_time * df.loc[:, 'norm_mean_days_since'])
    df.loc[:, 'timeXterm_freq'] = norm_cum_time * df.loc[:, 'mean_term_freq']
    return df
def _scaled_pairwise_distances(points, scale_0, scale_1):
    """Return the dense matrix of distances between all rows of *points*.

    Entry (i, j) is ``sqrt(((p_i0 - p_j0) * scale_0)**2
    + ((p_i1 - p_j1) * scale_1)**2)``, computed for all pairs at once
    instead of appending one row at a time to a growing buffer.
    """
    delta_0 = (points[:, 0][:, np.newaxis] - points[:, 0][np.newaxis, :]) * scale_0
    delta_1 = (points[:, 1][:, np.newaxis] - points[:, 1][np.newaxis, :]) * scale_1
    return np.sqrt(delta_0 ** 2 + delta_1 ** 2).astype(np.float64, copy=False)


# Total edge length of the minimum spanning tree over the PN towers.
# NOTE(review): the two scale factors look like km per degree of the first
# and second coordinate column - confirm the column order of the frame.
# DataFrame.values replaces DataFrame.as_matrix, which newer pandas dropped.
Pnmatrix1 = PN_cell_towers.values
newmatrix = _scaled_pairwise_distances(Pnmatrix1, 110.90444, 93.45318)
sA = csr_matrix(newmatrix)
Tcsr = csr_matrix(minimum_spanning_tree(sA))
cellnetPN = csr_matrix.sum(Tcsr)
print(cellnetPN)

# Same construction for the NB towers, with their own scale factors.
Pnmatrix2 = NB_cell_towers.values
newmatrix2 = _scaled_pairwise_distances(Pnmatrix2, 110.57483725, 111.29134198)
sA2 = csr_matrix(newmatrix2)
Tcsr2 = minimum_spanning_tree(sA2)
# Persist the label vectors and the full test-vs-train similarity matrix.
save_object_tofile("y_train.csv", y_train.values)
save_object_tofile("y_test.csv", y_test.values)
tmpe = Xt.dot(X.T)
save_sparse_csr("similarity.npz", tmpe)
del tmpe

start = timeit.default_timer()

# For every test row keep the K candidates with the largest inverse squared
# distance among the training rows whose similarity reaches the threshold E.
l = []
for i in range(Xt.shape[0]):
    print(i)
    ve = Xt[i].dot(X.T)
    r = []
    w = []
    e = []
    for m in range(ve.getnnz()):
        if not ve.data[m] >= E:
            continue
        j = ve.indices[m]
        difference = Xt[i] - X[j]
        dist = csr_matrix.sum(csr_matrix.power(difference, 2))
        w.append(1 / dist)
        r.append(y_train.values[j])  # y_train.index
        e.append(ve.data[m])
    # Tuples are (weight, training label, similarity), ranked by weight.
    l.append(heapq.nlargest(K, zip(w, r, e), key=lambda s: s[0]))
    del r, w, e, ve

save_object_tofile("tuple_list.dat", l)
stop = timeit.default_timer()
print("Time:", stop - start)

rst = predict_analyse("tuple_list.dat")
y_predicted = [i[3] for i in rst]
print_test_result(y_test, y_predicted)
#save_result("result.dat",y_predicted)
def compute_support(m):
    """Return the sum of all entries of m divided by ``N * (N - 1) / 2``.

    ``N`` is the number of columns of ``m``; the divisor is the number of
    unordered column pairs.  The division by two is an explicit float
    division, so an even ``N`` is no longer floored under Python 2.
    For ``N < 2`` the divisor is zero, as before.
    """
    N = m.shape[1]
    pair_count = N * (N - 1) / 2.0
    return csr.sum(m) / pair_count
def mcmc(G, iter, nburn, w0=False, beta=False, n=False, u=False, sigma=False,
         c=False, t=False, tau=False, x=False, hyperparams=False, wnu=False,
         all=False, sigma_sigma=0.01, sigma_c=0.01, sigma_t=0.01,
         sigma_tau=0.01, sigma_x=0.01, a_t=200, b_t=1, epsilon=0.01, R=5,
         w_inference='HMC', save_every=1000, init='none', index=None):
    """Run the MCMC sampler on graph G and return the saved chains.

    Every flag in ``w0, beta, n, u, sigma, c, t, tau, x`` that ``is True``
    marks a quantity to be updated inside the loop; a quantity whose flag
    is not True is read from ``G.graph`` / ``G.nodes`` and kept fixed.
    ``hyperparams=True`` switches on ``sigma, c, t, tau``; ``wnu=True``
    switches on ``w0, beta, n, u, x``; ``all=True`` does both (``tau`` and
    ``beta`` are switched off again when the prior is ``'singlepl'``).

    Graph attributes read: ``'prior'``, ``'gamma'``, ``'size_x'`` and,
    depending on the flags, ``'sigma'``, ``'c'``, ``'t'``, ``'tau'``,
    ``'counts'``, ``'ind'``, ``'selfedge'``.

    :param G: graph carrying the attributes listed above
    :param iter: number of iterations of the main loop
    :param nburn: iteration below which proposal scales are tuned
    :param sigma_sigma, sigma_c, sigma_t, sigma_tau, sigma_x: proposal
        scales passed to the update functions and adapted by ``aux.tune``
    :param a_t, b_t: passed through to the log-posterior / update helpers
    :param epsilon, R: passed to ``up.HMC_w``; ``epsilon`` is adapted
    :param w_inference: ``'HMC'`` or ``'gibbs'``
    :param save_every: a state is appended to the chains every
        ``save_every`` iterations
    :param init: container of optional ``'<name>_init'`` starting values
    :param index: passed through to ``up.update_x``
    :return: ``(w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est,
        n_est, u_est, log_post_param_est, log_post_est, p_ij_est)`` plus
        ``x_est`` as a last element when ``gamma != 0``

    NOTE(review): with the default ``init='none'`` every ``'..._init' in
    init`` test is a substring test on a string - presumably a dict is
    expected when starting values are supplied; confirm.
    """
    size = G.number_of_nodes()
    # NOTE(review): when an attribute is missing these expressions print a
    # message and bind None; execution continues.
    prior = G.graph['prior'] if 'prior' in G.graph else print(
        'You must specify a prior as attribute of G')
    gamma = G.graph['gamma'] if 'gamma' in G.graph else print(
        'You must specify spatial exponent gamma as attribute of G')
    size_x = G.graph['size_x'] if 'size_x' in G.graph else print(
        'You must specify size_x as attribute of G')

    # Group switches.
    if hyperparams is True or all is True:
        sigma = c = t = tau = True
        if prior == 'singlepl':
            tau = False
    if wnu is True or all is True:
        w0 = beta = n = u = x = True
        if prior == 'singlepl':
            beta = False

    # Initial values: from init when present, else random (updated
    # quantities) or from the graph (fixed quantities).
    if sigma is True:
        sigma_est = [init['sigma_init']
                     ] if 'sigma_init' in init else [float(np.random.rand(1))]
    else:
        sigma_est = [G.graph['sigma']]
    if c is True:
        c_est = [init['c_init']
                 ] if 'c_init' in init else [float(5 * np.random.rand(1) + 1)]
    else:
        c_est = [G.graph['c']]
    if t is True:
        t_est = [init['t_init']] if 't_init' in init else [
            float(np.random.gamma(a_t, 1 / b_t))
        ]
    else:
        t_est = [G.graph['t']]
    if prior == 'doublepl':
        if tau is True:
            tau_est = [init['tau_init']] if 'tau_init' in init else [
                float(5 * np.random.rand(1) + 1)
            ]
        else:
            tau_est = [G.graph['tau']]
    else:
        tau_est = [0]

    z_est = [(size * sigma_est[0] / t_est[0]) ** (1 / sigma_est[0])] if G.graph['prior'] == 'singlepl' else \
        [(size * tau_est[0] * sigma_est[0] ** 2 / (t_est[0] * c_est[0] ** (sigma_est[0] * (tau_est[0] - 1)))) ** \
         (1 / sigma_est[0])]

    if w0 is True:
        if 'w0_init' in init:
            w0_est = [init['w0_init']]
        else:
            g = np.random.gamma(1 - sigma_est[0], 1, size)
            unif = np.random.rand(size)
            w0_est = [
                np.multiply(
                    g,
                    np.power(((z_est[0] + c_est[0])**sigma_est[0]) *
                             (1 - unif) + (c_est[0]**sigma_est[0]) * unif,
                             -1 / sigma_est[0]))
            ]
    else:
        w0_est = [
            np.array([G.nodes[i]['w0'] for i in range(G.number_of_nodes())])
        ]
    if prior == 'doublepl' and beta is True:
        beta_est = [init['beta_init']] if 'beta_init' in init else [
            float(np.random.beta(sigma_est[0] * tau_est[0], 1))
        ]
    if prior == 'singlepl' or beta is False:
        beta_est = [np.array([G.nodes[i]['beta'] for i in range(G.number_of_nodes())])] if 'beta' in G.nodes[0] \
            else [np.ones((size))]
    if u is True:
        u_est = [init['u_init']] if 'u_init' in init else [
            tp.tpoissrnd(z_est[0] * w0_est[0])
        ]
    else:
        u_est = [
            np.array([G.nodes[i]['u'] for i in range(G.number_of_nodes())])
        ]
    if x is True:
        x_est = [init['x_init']
                 ] if 'x_init' in init else [size_x * np.random.rand(size)]
        p_ij_est = [aux.space_distance(x_est[-1], gamma)]
    else:
        if gamma != 0:
            x_est = [
                np.array([G.nodes[i]['x'] for i in range(G.number_of_nodes())])
            ]
            p_ij_est = [aux.space_distance(x_est[-1], gamma)]
        else:
            p_ij_est = [np.ones((size, size))]

    # ind[i]: neighbours j of node i with j > i.
    if 'ind' in G.graph:
        ind = G.graph['ind']
    else:
        ind = {k: [] for k in G.nodes}
        for i in G.nodes:
            for j in G.adj[i]:
                if j > i:
                    ind[i].append(j)
    # selfedge: nodes i that appear in their own ind[i].
    if 'selfedge' in G.graph:
        selfedge = G.graph['selfedge']
    else:
        selfedge = [i in ind[i] for i in G.nodes]
        selfedge = list(compress(G.nodes, selfedge))

    if n is True:
        if 'n_init' in init:
            n_est = [init['n_init']]
        else:
            # NOTE(review): the result is indexed with [0] here but used
            # un-indexed inside the loop below - confirm what up.update_n
            # returns.
            out_n = up.update_n(w0_est[0], G, size, p_ij_est[-1], ind,
                                selfedge)
            n_est = [out_n[0]]
    else:
        n_est = [G.graph['counts']]

    w_est = [np.exp(np.log(w0_est[0]) - np.log(beta_est[0]))]

    adj = n_est[-1] > 0

    log_post_param_est = [
        aux.log_post_params(prior, sigma_est[-1], c_est[-1], t_est[-1],
                            tau_est[-1], w0_est[-1], beta_est[-1], u_est[-1],
                            a_t, b_t)
    ]
    # Column sums plus transposed row sums of the count matrix, as a vector.
    sum_n = np.array(
        csr_matrix.sum(n_est[-1], axis=0) +
        np.transpose(csr_matrix.sum(n_est[-1], axis=1)))[0]
    log_post_est = [
        aux.log_post_logwbeta_params(prior,
                                     sigma_est[-1],
                                     c_est[-1],
                                     t_est[-1],
                                     tau_est[-1],
                                     w_est[-1],
                                     w0_est[-1],
                                     beta_est[-1],
                                     n_est[-1],
                                     u_est[-1],
                                     p_ij_est[-1],
                                     a_t,
                                     b_t,
                                     gamma,
                                     sum_n,
                                     adj,
                                     log_post_par=log_post_param_est[-1])[0]
    ]
    print('log post initial', log_post_est[-1])

    # Acceptance bookkeeping and tuning constants.
    accept_params = [0]
    accept_hmc = 0
    accept_distance = [0]
    rate = [0]
    rate_p = [0]
    step = 100
    nadapt = 1000

    # Current state of the chain.
    sigma_prev = sigma_est[-1]
    c_prev = c_est[-1]
    t_prev = t_est[-1]
    tau_prev = tau_est[-1]
    w_prev = w_est[-1]
    w0_prev = w0_est[-1]
    beta_prev = beta_est[-1]
    n_prev = n_est[-1]
    # NOTE(review): x_prev is bound only when gamma != 0, yet the x update
    # below reads it whenever x is True - confirm gamma != 0 in that case.
    if gamma != 0:
        x_prev = x_est[-1]
    p_ij_prev = p_ij_est[-1]
    u_prev = u_est[-1]
    z_prev = z_est[-1]

    # Cached sums that are passed to the log-posterior helpers.
    p = adj.multiply(p_ij_est[-1])
    nlogp = coo_matrix.sum(n_est[-1].multiply(
        p._with_data(np.log(p.data), copy=True)))
    nlogw = sum(sum_n * np.log(w_est[-1]))
    wpw = sum(w_est[-1] * np.dot(p_ij_est[-1], w_est[-1]))
    uw0 = sum((u_est[-1] - 1) * np.log(w0_est[-1]))
    sumw0 = sum(np.log(w0_est[-1]))

    for i in range(iter):

        # update hyperparameters if at least one of them demands the update
        if sigma is True or c is True or t is True or tau is True:
            output_params = up.update_params(prior,
                                             sigma_prev,
                                             c_prev,
                                             t_prev,
                                             tau_prev,
                                             z_prev,
                                             w0_prev,
                                             beta_prev,
                                             u_prev,
                                             log_post_param_est[-1],
                                             accept_params[-1],
                                             sigma=sigma,
                                             c=c,
                                             t=t,
                                             tau=tau,
                                             sigma_sigma=sigma_sigma,
                                             sigma_c=sigma_c,
                                             sigma_t=sigma_t,
                                             sigma_tau=sigma_tau,
                                             a_t=a_t,
                                             b_t=b_t)
            sigma_prev = output_params[0]
            c_prev = output_params[1]
            t_prev = output_params[2]
            tau_prev = output_params[3]
            z_prev = output_params[4]
            accept_params.append(output_params[5])
            log_post_param_est.append(output_params[6])
            rate_p.append(output_params[7])
            if (i + 1) % save_every == 0 and i != 0:
                sigma_est.append(sigma_prev)
                c_est.append(c_prev)
                t_est.append(t_prev)
                tau_est.append(tau_prev)
                z_est.append(z_prev)
            if i % 1000 == 0:
                print('update hyperparams iteration ', i)
                print('acceptance rate hyperparams = ',
                      round(accept_params[-1] / (i + 1) * 100, 1), '%')
            # Tune the proposal scales every `step` iterations during burn-in.
            if (i % step) == 0 and i != 0 and i < nburn:
                if sigma is True:
                    sigma_sigma = aux.tune(accept_params, sigma_sigma, step)
                if c is True:
                    sigma_c = aux.tune(accept_params, sigma_c, step)
                if t is True:
                    sigma_t = aux.tune(accept_params, sigma_t, step)
                if tau is True:
                    sigma_tau = aux.tune(accept_params, sigma_tau, step)

        # update w and beta if at least one of them is True
        if w0 is True:
            if accept_params[-1] == 0:
                log_post_est.append(log_post_est[-1])
            if accept_params[-1] == 1:
                temp = aux.log_post_logwbeta_params(
                    prior,
                    sigma_prev,
                    c_prev,
                    t_prev,
                    tau_prev,
                    w_prev,
                    w0_prev,
                    beta_prev,
                    n_prev,
                    u_prev,
                    p_ij_prev,
                    a_t,
                    b_t,
                    gamma,
                    sum_n,
                    adj,
                    log_post_par=log_post_param_est[-1],
                    nlogp=nlogp,
                    nlogw=nlogw,
                    wpw=wpw,
                    uw0=uw0,
                    sumw0=sumw0)
                log_post_est.append(temp[0])
            if w_inference == 'gibbs':
                output_gibbs = up.gibbs_w(w_prev, beta_prev, sigma_prev,
                                          c_prev, z_prev, u_prev, n_prev,
                                          p_ij_prev, gamma, sum_n)
                w_prev = output_gibbs[0]
                w0_prev = output_gibbs[1]
                log_post_param_est.append(
                    aux.log_post_params(prior, sigma_prev, c_prev, t_prev,
                                        tau_prev, w0_prev, beta_prev, u_prev,
                                        a_t, b_t))
                # NOTE(review): this call passes `log_post=` while every
                # other call passes `log_post_par=` - verify the keyword
                # against aux.log_post_logwbeta_params.
                temp = aux.log_post_logwbeta_params(
                    prior,
                    sigma_prev,
                    c_prev,
                    t_prev,
                    tau_prev,
                    w_prev,
                    w0_prev,
                    beta_prev,
                    n_prev,
                    u_prev,
                    p_ij_prev,
                    a_t,
                    b_t,
                    gamma,
                    sum_n,
                    adj,
                    log_post=log_post_param_est[-1],
                    nlogp=nlogp)
                log_post_est.append(temp[0])
                ##
                nlogw = temp[2]
                wpw = temp[3]
                uw0 = temp[4]
                sumw0 = temp[5]
                ##
                if i % 1000 == 0 and i != 0:
                    print('update w (gibbs) iteration ', i)
            if w_inference == 'HMC':
                output_hmc = up.HMC_w(prior,
                                      w_prev,
                                      w0_prev,
                                      beta_prev,
                                      n_prev,
                                      u_prev,
                                      sigma_prev,
                                      c_prev,
                                      t_prev,
                                      tau_prev,
                                      z_prev,
                                      gamma,
                                      p_ij_prev,
                                      a_t,
                                      b_t,
                                      epsilon,
                                      R,
                                      accept_hmc,
                                      size,
                                      sum_n,
                                      adj,
                                      log_post_est[-1],
                                      log_post_param_est[-1],
                                      nlogp,
                                      nlogw,
                                      wpw,
                                      uw0,
                                      sumw0,
                                      update_beta=beta)
                w_prev = output_hmc[0]
                w0_prev = output_hmc[1]
                beta_prev = output_hmc[2]
                accept_hmc = output_hmc[3]
                rate.append(output_hmc[4])
                log_post_est.append(output_hmc[5])
                log_post_param_est.append(output_hmc[6])
                ##
                nlogw = output_hmc[7]
                wpw = output_hmc[8]
                uw0 = output_hmc[9]
                sumw0 = output_hmc[10]
                ##
                # Every 100 iterations nudge epsilon so that the mean of the
                # last `step` entries of `rate` moves towards 0.6.
                if i % 100 == 0 and i != 0:
                    # if i < nadapt:
                    if i >= step:
                        # epsilon = np.exp(np.log(epsilon) + 0.01 * (np.mean(rate) - 0.6))
                        epsilon = np.exp(
                            np.log(epsilon) + 0.01 *
                            (np.mean(rate[i - step:i]) - 0.6))
                if i % 1000 == 0:
                    print('update w and beta iteration ', i)
                    print('acceptance rate HMC = ',
                          round(accept_hmc / (i + 1) * 100, 1), '%')
                    print('epsilon = ', epsilon)
            if (i + 1) % save_every == 0 and i != 0:
                w_est.append(w_prev)
                w0_est.append(w0_prev)
                beta_est.append(beta_prev)

        # update n
        step_n = 1
        if n is True and (i + 1) % step_n == 0:
            n_prev = up.update_n(w_prev, G, size, p_ij_prev, ind, selfedge)
            sum_n = np.array(
                csr_matrix.sum(n_prev, axis=0) +
                np.transpose(csr_matrix.sum(n_prev, axis=1)))[0]
            log_post_param_est.append(log_post_param_est[-1])
            temp = aux.log_post_logwbeta_params(
                prior,
                sigma_prev,
                c_prev,
                t_prev,
                tau_prev,
                w_prev,
                w0_prev,
                beta_prev,
                n_prev,
                u_prev,
                p_ij_prev,
                a_t,
                b_t,
                gamma,
                sum_n,
                adj,
                log_post_par=log_post_param_est[-1],
                wpw=wpw,
                uw0=uw0,
                sumw0=sumw0)
            log_post_est.append(temp[0])
            ##
            nlogp = temp[1]
            nlogw = temp[2]
            ##
            if (i + 1) % save_every == 0 and i != 0:
                n_est.append(n_prev)
            if i % 1000 == 0:
                print('update n iteration ', i)

        # update u
        if u is True:
            u_prev = up.posterior_u(z_prev * w0_prev)
            log_post_param_est.append(
                aux.log_post_params(prior, sigma_prev, c_prev, t_prev,
                                    tau_prev, w0_prev, beta_prev, u_prev, a_t,
                                    b_t))
            temp = aux.log_post_logwbeta_params(
                prior,
                sigma_prev,
                c_prev,
                t_prev,
                tau_prev,
                w_prev,
                w0_prev,
                beta_prev,
                n_prev,
                u_prev,
                p_ij_prev,
                a_t,
                b_t,
                gamma,
                sum_n,
                adj,
                log_post_par=log_post_param_est[-1],
                nlogp=nlogp,
                nlogw=nlogw,
                wpw=wpw,
                sumw0=sumw0)
            log_post_est.append(temp[0])
            ##
            uw0 = temp[4]
            ##
            if (i + 1) % save_every == 0 and i != 0:
                u_est.append(u_prev)
            if i % 1000 == 0:
                print('update u iteration ', i)

        # update x (locations) and the derived p_ij
        step_x = 1
        if x is True and (i + 1) % step_x == 0:
            out_x = up.update_x(x_prev, w_prev, gamma, p_ij_prev, n_prev,
                                sigma_x, accept_distance[-1], prior,
                                sigma_prev, c_prev, t_prev, tau_prev, w0_prev,
                                beta_prev, u_prev, a_t, b_t, sum_n, adj,
                                log_post_est[-1], log_post_param_est[-1],
                                index, nlogw, uw0, sumw0, nlogp, wpw)
            x_prev = out_x[0]
            p_ij_prev = out_x[1]
            accept_distance.append(out_x[2])
            log_post_est.append(out_x[3])
            ##
            nlogp = out_x[4]
            wpw = out_x[5]
            ##
            if (i + 1) % save_every == 0 and i != 0:
                p_ij_est.append(p_ij_prev)
                x_est.append(x_prev)
            if i % 1000 == 0:
                print('update x iteration ', i)
                print('acceptance rate x = ',
                      round(accept_distance[-1] * 100 * step_x / iter, 1),
                      '%')
                print('sigma_x = ', sigma_x)
            if (i % (step / step_x)) == 0 and i != 0 and i < nburn:
                sigma_x = aux.tune(accept_distance, sigma_x,
                                   int(step / step_x))

    # x_est is part of the result only when gamma != 0.
    if gamma != 0:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est, u_est, \
                log_post_param_est, log_post_est, p_ij_est, x_est
    else:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est, u_est, \
                log_post_param_est, log_post_est, p_ij_est
def main(nelx, nely, nelz, volfrac, penal, rmin, heaviside):
    """Run a 3D minimum-compliance topology optimisation with an AM filter.

    The routine assembles an H8 finite-element model on a
    ``nelx * nely * nelz`` element grid, builds a linear density filter of
    radius ``rmin`` and iterates an optimality-criteria update.  Every
    physical density field is passed through ``AMFilter3D.AMFilter`` before
    it is used in the analysis.

    Parameters
    ----------
    nelx, nely, nelz : int
        Number of elements along each axis.
    volfrac : float
        Initial uniform density; also the bound on the mean printed density.
    penal : float
        Exponent applied to the densities in the stiffness interpolation.
    rmin : float
        Filter radius, in element units.
    heaviside : int
        ``0`` for the plain density filter, ``1`` for the projection with
        ``beta`` continuation (``beta`` doubles up to 512).

    Returns
    -------
    xPrint
        Density field returned by the last ``AMFilter3D.AMFilter`` call.

    Raises
    ------
    ValueError
        If ``heaviside`` is neither 0 nor 1.
    """
    # Fail before the expensive set-up; previously an unsupported value only
    # surfaced as an unbound ``xPrint`` at the final ``return``.
    if heaviside not in (0, 1):
        raise ValueError(
            "heaviside must be 0 or 1, got {0!r}".format(heaviside))

    # USER DEFINED PRINT ORIENTATION
    baseplate = 'S'
    # USER DEFINED LOOP PARAMETERS
    maxloop = 1000
    tolx = 0.01
    # USER DEFINED MATERIAL PROPERTIES
    E0 = 1
    Emin = 1e-9
    nu = 0.3
    # USER DEFINED LOAD DoFs (indices are 1-based, shifted when used below)
    il, jl, kl = np.meshgrid(nelx, 0, np.arange(nelz + 1))
    loadnid = kl * (nelx + 1) * (nely + 1) + il * (nely + 1) + (nely + 1 - jl)
    # 1D array (used to build the sparse load vector later)
    loaddof = 3 * np.ravel(loadnid, order='F') - 1
    # USER DEFINED SUPPORT FIXED DOFS
    iif, jf, kf = np.meshgrid(0, np.arange(nely + 1), np.arange(nelz + 1))
    fixednid = (kf * (nelx + 1) * (nely + 1) + iif * (nely + 1)
                + (nely + 1 - jf))
    # 1D array holding all three DoFs of every fixed node
    fixeddof = np.concatenate((3 * np.ravel(fixednid, order='F'),
                               3 * np.ravel(fixednid, order='F') - 1,
                               3 * np.ravel(fixednid, order='F') - 2))

    # PREPARE FE ANALYSIS
    nele = nelx * nely * nelz
    ndof = 3 * (nelx + 1) * (nely + 1) * (nelz + 1)
    F = csr_matrix((-1 * np.ones(np.shape(loaddof)),
                    (loaddof - 1, np.ones(np.shape(loaddof)) - 1)),
                   shape=(ndof, 1))
    U = np.zeros((ndof, 1))
    freedofs = np.setdiff1d(np.arange(ndof) + 1, fixeddof)
    KE = lk_H8(nu)
    nodegrd = np.reshape(np.arange((nely + 1) * (nelx + 1)) + 1,
                         (nely + 1, nelx + 1), order='F')
    nodeids = np.reshape(nodegrd[0:-1, 0:-1], (nely * nelx, 1), order='F')
    nodeidz = np.arange(0, (nelz - 1) * (nely + 1) * (nelx + 1) + 1,
                        (nely + 1) * (nelx + 1))[np.newaxis]
    nodeids = (np.matlib.repmat(nodeids, np.shape(nodeidz)[0],
                                np.shape(nodeidz)[1])
               + np.matlib.repmat(nodeidz, np.shape(nodeids)[0],
                                  np.shape(nodeids)[1]))
    edofVec = (3 * np.ravel(nodeids, order='F') + 1)[np.newaxis]
    edofMat = (np.matlib.repmat(edofVec.T, 1, 24)
               + np.matlib.repmat(np.concatenate((
                   np.array([0, 1, 2]),
                   3 * nely + np.array([3, 4, 5, 0, 1, 2]),
                   np.array([-3, -2, -1]),
                   3 * (nely + 1) * (nelx + 1) + np.concatenate((
                       np.array([0, 1, 2]),
                       3 * nely + np.array([3, 4, 5, 0, 1, 2]),
                       np.array([-3, -2, -1])
                   ))
               )), nele, 1))
    iK = np.reshape(np.kron(edofMat, np.ones((24, 1))).T,
                    (24 * 24 * nele, 1), order='F')
    jK = np.reshape(np.kron(edofMat, np.ones((1, 24))).T,
                    (24 * 24 * nele, 1), order='F')

    # PREPARE FILTER
    # The triplet arrays are pre-sized; entries beyond that size spill into
    # the *dummy lists and are concatenated afterwards.
    iH = np.ones((int(nele * (2 * (np.ceil(rmin) - 1) + 1) ** 2), 1))
    iHdummy = []
    jH = np.ones(np.shape(iH))
    jHdummy = []
    sH = np.zeros(np.shape(iH))
    sHdummy = []
    k = 0
    for k1 in np.arange(nelz) + 1:
        for i1 in np.arange(nelx) + 1:
            for j1 in np.arange(nely) + 1:
                e1 = (k1 - 1) * nelx * nely + (i1 - 1) * nely + j1
                for k2 in np.arange(max(k1 - (np.ceil(rmin) - 1), 1),
                                    min(k1 + (np.ceil(rmin) - 1), nelz) + 1):
                    for i2 in np.arange(
                            max(i1 - (np.ceil(rmin) - 1), 1),
                            min(i1 + (np.ceil(rmin) - 1), nelx) + 1):
                        for j2 in np.arange(
                                max(j1 - (np.ceil(rmin) - 1), 1),
                                min(j1 + (np.ceil(rmin) - 1), nely) + 1):
                            e2 = ((k2 - 1) * nelx * nely
                                  + (i2 - 1) * nely + j2)
                            if k < np.size(iH):
                                iH[k] = e1
                                jH[k] = e2
                                sH[k] = max(0, rmin - np.sqrt(
                                    (i1 - i2) ** 2 + (j1 - j2) ** 2
                                    + (k1 - k2) ** 2))
                            else:
                                iHdummy.append(e1)
                                jHdummy.append(e2)
                                sHdummy.append(max(0, rmin - np.sqrt(
                                    (i1 - i2) ** 2 + (j1 - j2) ** 2
                                    + (k1 - k2) ** 2)))
                            k = k + 1
    iH = np.concatenate((iH, np.array(iHdummy).reshape((len(iHdummy), 1))))
    jH = np.concatenate((jH, np.array(jHdummy).reshape((len(jHdummy), 1))))
    sH = np.concatenate((sH, np.array(sHdummy).reshape((len(sHdummy), 1))))
    H = csr_matrix((np.squeeze(sH), (np.squeeze(iH.astype(int)) - 1,
                                     np.squeeze(jH.astype(int)) - 1)))
    Hs = csr_matrix.sum(H, axis=0).T

    if heaviside == 0:
        # INITIALIZE ITERATION
        x = np.tile(volfrac, [nelz, nely, nelx])
        xPhys = x
        # AMFILTER CALL TYPE 1 (densities only)
        xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
        loop = 0
        change = 1
        # START ITERATION
        while change > tolx and loop < maxloop:
            loop = loop + 1
            # FE ANALYSIS
            sK = np.reshape(
                np.ravel(KE, order='F')[np.newaxis].T
                @ (Emin + xPrint.transpose(0, 2, 1).ravel(order='C')[np.newaxis]
                   ** penal * (E0 - Emin)),
                (24 * 24 * nele, 1), order='F')
            K = csr_matrix((np.squeeze(sK), (np.squeeze(iK.astype(int)) - 1,
                                             np.squeeze(jK.astype(int)) - 1)))
            K = (K + K.T) / 2
            U[freedofs - 1, :] = spsolve(K[freedofs - 1, :][:, freedofs - 1],
                                         F[freedofs - 1, :])[np.newaxis].T
            # OBJECTIVE FUNCTION AND SENSITIVITY ANALYSIS
            ce = np.reshape(
                np.sum((U[edofMat - 1].squeeze() @ KE)
                       * U[edofMat - 1].squeeze(), axis=1),
                (nelz, nelx, nely), order='C').transpose(0, 2, 1)
            # The interpolated stiffness must be parenthesised before it is
            # multiplied by ce; otherwise Emin is added once per element.
            c = np.sum((Emin + xPrint ** penal * (E0 - Emin)) * ce)
            dc = -penal * (E0 - Emin) * (xPrint ** (penal - 1)) * ce
            dv = np.ones((nelz, nely, nelx))
            # AMFILTER CALL TYPE 2 (densities and sensitivities)
            xPrint, senS = AMFilter3D.AMFilter(xPhys, baseplate, dc, dv)
            dc = senS[0]
            dv = senS[1]
            # FILTERING AND MODIFICATION OF SENSITIVITIES
            dc = np.array((H @ (dc.transpose(0, 2, 1).ravel(order='C')
                                [np.newaxis].T / Hs))).reshape(
                (nelz, nelx, nely), order='C').transpose(0, 2, 1)
            dv = np.array((H @ (dv.transpose(0, 2, 1).ravel(order='C')
                                [np.newaxis].T / Hs))).reshape(
                (nelz, nelx, nely), order='C').transpose(0, 2, 1)
            # OPTIMALITY CRITERIA UPDATE (bisection on the multiplier)
            l1 = 0
            l2 = 1e9
            move = 0.05
            while (l2 - l1) / (l1 + l2) > 1e-3 and l2 > 1e-9:
                lmid = 0.5 * (l2 + l1)
                xnew_step1 = np.minimum(x + move,
                                        x * np.sqrt(-dc / dv / lmid))
                xnew_step2 = np.minimum(1, xnew_step1)
                xnew_step3 = np.maximum(x - move, xnew_step2)
                xnew = np.maximum(0, xnew_step3)
                xPhys = np.array(
                    (H @ (xnew.transpose(0, 2, 1).ravel(order='C')
                          [np.newaxis].T) / Hs)).reshape(
                    (nelz, nelx, nely), order='C').transpose(0, 2, 1)
                # AMFILTER CALL TYPE 1
                xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
                if np.sum(xPrint.ravel(order='C')) > volfrac * nele:
                    l1 = lmid
                else:
                    l2 = lmid
            change = np.max(np.absolute(np.ravel(xnew, order='F')
                                        - np.ravel(x, order='F')))
            x = xnew
            print("it.: {0} , ch.: {1:.3f}, obj.: {2:.4f}, Vol.: {3:.3f}".format(
                loop, change, c, np.mean(xPrint.ravel(order='C'))))
    else:
        beta = 1
        # INITIALIZE ITERATION
        x = np.tile(volfrac, [nelz, nely, nelx])
        xTilde = x
        xPhys = 1 - np.exp(-beta * xTilde) + xTilde * np.exp(-beta)
        # AMFILTER CALL TYPE 1 (densities only)
        xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
        loop = 0
        loopbeta = 0
        change = 1
        # START ITERATION
        while change > tolx and loop < maxloop:
            loop = loop + 1
            loopbeta = loopbeta + 1
            # FE ANALYSIS
            sK = np.reshape(
                np.ravel(KE, order='F')[np.newaxis].T
                @ (Emin + xPrint.transpose(0, 2, 1).ravel(order='C')[np.newaxis]
                   ** penal * (E0 - Emin)),
                (24 * 24 * nele, 1), order='F')
            K = csr_matrix((np.squeeze(sK), (np.squeeze(iK.astype(int)) - 1,
                                             np.squeeze(jK.astype(int)) - 1)))
            K = (K + K.T) / 2
            U[freedofs - 1, :] = spsolve(K[freedofs - 1, :][:, freedofs - 1],
                                         F[freedofs - 1, :])[np.newaxis].T
            # OBJECTIVE FUNCTION AND SENSITIVITY ANALYSIS
            ce = np.reshape(
                np.sum((U[edofMat - 1].squeeze() @ KE)
                       * U[edofMat - 1].squeeze(), axis=1),
                (nelz, nelx, nely), order='C').transpose(0, 2, 1)
            # Same parenthesisation fix as in the heaviside == 0 branch.
            c = np.sum((Emin + xPrint ** penal * (E0 - Emin)) * ce)
            dc = -penal * (E0 - Emin) * (xPrint ** (penal - 1)) * ce
            dv = np.ones((nelz, nely, nelx))
            # AMFILTER CALL TYPE 2 (densities and sensitivities)
            xPrint, senS = AMFilter3D.AMFilter(xPhys, baseplate, dc, dv)
            dc = senS[0]
            dv = senS[1]
            # FILTERING AND MODIFICATION OF SENSITIVITIES
            # dx is the derivative of the projection xPhys(xTilde).
            dx = beta * np.exp(-beta * xTilde) + np.exp(-beta)
            dc = np.array((H @ (
                dc.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T
                * dx.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T
                / Hs))).reshape(
                (nelz, nelx, nely), order='C').transpose(0, 2, 1)
            dv = np.array((H @ (
                dv.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T
                * dx.transpose(0, 2, 1).ravel(order='C')[np.newaxis].T
                / Hs))).reshape(
                (nelz, nelx, nely), order='C').transpose(0, 2, 1)
            # OPTIMALITY CRITERIA UPDATE (bisection on the multiplier)
            l1 = 0
            l2 = 1e9
            move = 0.05
            # The l2 guard mirrors the other branch: without it, an inactive
            # volume constraint halves l2 down to 0.0 and the ratio becomes
            # 0.0 / 0.0, which raises ZeroDivisionError.
            while (l2 - l1) / (l1 + l2) > 1e-3 and l2 > 1e-9:
                lmid = 0.5 * (l2 + l1)
                xnew_step1 = np.minimum(x + move,
                                        x * np.sqrt(-dc / dv / lmid))
                xnew_step2 = np.minimum(1, xnew_step1)
                xnew_step3 = np.maximum(x - move, xnew_step2)
                xnew = np.maximum(0, xnew_step3)
                xTilde = np.array(
                    (H @ (xnew.transpose(0, 2, 1).ravel(order='C')
                          [np.newaxis].T) / Hs)).reshape(
                    (nelz, nelx, nely), order='C').transpose(0, 2, 1)
                xPhys = 1 - np.exp(-beta * xTilde) + xTilde * np.exp(-beta)
                # AMFILTER CALL TYPE 1
                xPrint, _ = AMFilter3D.AMFilter(xPhys, baseplate)
                if np.sum(xPrint.ravel(order='C')) > volfrac * nele:
                    l1 = lmid
                else:
                    l2 = lmid
            change = np.max(np.absolute(np.ravel(xnew, order='F')
                                        - np.ravel(x, order='F')))
            x = xnew
            # Continuation: sharpen the projection and force another pass.
            if beta < 512 and (loopbeta >= 50 or change <= 0.01):
                beta = 2 * beta
                loopbeta = 0
                change = 1
                print("Parameter beta increased to {0}. \n".format(beta))
            print("it.: {0} , ch.: {1:.3f}, obj.: {2:.4f}, Vol.: {3:.3f}".format(
                loop, change, c, np.mean(xPrint.ravel(order='C'))))
    return xPrint
df.index = titles a = df.sum(axis=1) mask2 = a>0 return df.loc[mask2,:] def create_sparse_matrix(list_of_followers): list_of_strings = [] list_of_titles = [] for ls in list_of_followers: x = ' '.join([ str(x) for x in ls['TwitterFollowers']]) list_of_strings.append(x) list_of_titles.append(ls['Title']) return list_of_strings, list_of_titles strings,titles = create_sparse_matrix(practice) final = CV.fit_transform(strings) a = csr_matrix.sum(final, axis=0) a = np.array(a).reshape(a.shape[1],) sparse = final.toarray() mask = a>1 u=np.where(mask)[0] sparse_mat = final.tocsc()[:,u] df = sparse_mat.toarray() attempt = get_truncated_matrix(mat, 25000) df = pd.DataFrame(attempt) item_matrix=gl.SFrame(df) item_matrix.save("IHavebeensaved.gl")
def GraphSampler(prior, approximation, typesampler, sigma, c, t, tau, gamma,
                 size_x, type_prior_x, dim_x, a_t=200, b_t=1, print_=True,
                 **kwargs):
    """Sample a graph and attach its parameters and derived quantities.

    Weights are drawn with ``weight.WeightsSampler`` and locations with
    ``loc.LocationsSampler``; any of ``w``, ``w0``, ``beta`` and ``x`` given
    in ``kwargs`` replaces the sampled value.  The graph itself is produced
    by ``NaiveSampler`` or ``SamplerLayers_optim`` depending on
    ``typesampler``.

    Parameters
    ----------
    prior, approximation, sigma, c, t, tau
        Forwarded to ``weight.WeightsSampler``; ``prior == 'singlepl'``
        selects the first of the two formulas used for ``z``.
    typesampler : {"naive", "layers"}
        Which graph sampler to use.
    gamma
        Spatial exponent; ``gamma == 0`` makes the distance matrix all ones.
    size_x, type_prior_x, dim_x
        Forwarded to ``loc.LocationsSampler`` (and the graph samplers).
    a_t, b_t
        Stored on the graph and forwarded to the log-posterior helpers.
    print_ : bool
        When exactly ``True``, print timing and node counts.
    **kwargs
        Optional ``w``, ``w0``, ``beta``, ``x`` overrides and ``K`` (default
        100) for the "layers" sampler; also forwarded to
        ``weight.WeightsSampler``.

    Returns
    -------
    G
        The sampled graph with graph-level attributes (``prior``, ``sigma``,
        ``c``, ``t``, ``tau``, ``gamma``, ``size_x``, ``a_t``, ``b_t``,
        ``z``, ``ind``, ``selfedge``, ``distances``, ``counts``, ``sum_n``,
        ``sum_fact_n``, ``log_post``, ``log_post_param``) and node attributes
        ``w``, ``w0``, ``beta``, ``x``, ``u``.

    Raises
    ------
    ValueError
        If ``typesampler`` is not "naive" or "layers".
    """
    # Reject an unknown sampler before any sampling work is done; previously
    # this only failed later because ``G`` was never assigned.
    if typesampler not in ("naive", "layers"):
        raise ValueError(
            'typesampler must be "naive" or "layers", got %r' % (typesampler,))

    start = time.time()

    # sample weights w, w0, beta (the sampler always runs, overrides or not)
    output = weight.WeightsSampler(prior, approximation, t, sigma, c, tau,
                                   **kwargs)
    w = kwargs['w'] if 'w' in kwargs else output[0]
    w0 = kwargs['w0'] if 'w0' in kwargs else output[1]
    beta = kwargs['beta'] if 'beta' in kwargs else output[2]
    size = len(w)

    # sample locations (only when the caller did not supply them)
    x = kwargs['x'] if 'x' in kwargs else loc.LocationsSampler(
        size_x, size, type_prior_x, dim_x)

    # sample graph
    if typesampler == "naive":
        G, w, x, size = NaiveSampler(w, x, gamma, dim_x)
    else:
        K = kwargs['K'] if 'K' in kwargs else 100
        G, w, x, size = SamplerLayers_optim(w, x, gamma, size_x, K)

    end = time.time()

    deg = np.array(list(dict(G.degree()).values()))
    if print_ is True:
        print('time to produce sample: ', round((end - start) / 60, 2), ' min')
        print('number of active nodes: ', sum(deg > 0))
        print('total number of nodes L: ', len(deg))

    G.graph['prior'] = prior
    G.graph['sigma'] = sigma
    G.graph['c'] = c
    G.graph['t'] = t
    G.graph['tau'] = tau
    G.graph['gamma'] = gamma
    G.graph['size_x'] = size_x
    G.graph['a_t'] = a_t
    G.graph['b_t'] = b_t

    # set nodes attributes: w, w0, beta, x, u
    if prior == 'singlepl':
        z = (size * sigma / t) ** (1 / sigma)
    else:
        z = (size * tau * sigma ** 2
             / (t * c ** (sigma * (tau - 1)))) ** (1 / sigma)
    G.graph['z'] = z
    u = tp.tpoissrnd(z * w0)
    node_attrs = {
        i: {'w': w[i], 'w0': w0[i], 'beta': beta[i], 'x': x[i], 'u': u[i]}
        for i in G.nodes()
    }
    nx.set_node_attributes(G, node_attrs)

    # set graph attributes: ind (upper triangular matrix of neighbors of
    # nodes) and selfedge (list of nodes w/ selfedge)
    ind = {i: [j for j in G.adj[i] if j >= i] for i in G.nodes}
    selfedge = [i for i in G.nodes if i in ind[i]]
    G.graph['ind'] = ind
    G.graph['selfedge'] = selfedge

    # computing "distance" matrix p_ij = 1 / ((1 + |x_i-x_j|) ** gamma)
    p_ij = aux.space_distance(x, gamma) if gamma != 0 else np.ones(
        (size, size))
    G.graph['distances'] = p_ij

    # computing counts upper triangular matrix n
    n_out = up.update_n(w, G, size, p_ij, ind, selfedge)
    n = n_out[0]
    # for the counts, it would be nice to set up a nx.MultiGraph, but some
    # algorithms don't work on these graphs, so for the moment n is assigned
    # as attribute to the whole graph rather than the single nodes
    G.graph['counts'] = n
    sum_n = np.array(
        csr_matrix.sum(n, axis=0)
        + np.transpose(csr_matrix.sum(n, axis=1)))[0]
    G.graph['sum_n'] = sum_n
    sum_fact_n = n_out[1]
    G.graph['sum_fact_n'] = sum_fact_n

    # attach log posterior of the graph as attribute
    adj = n > 0
    # NOTE: a "speed up when updating x alone" variant used to be sketched
    # here; it passed ``index=np.argsort(deg)[:-1]`` to
    # aux.log_post_logwbeta_params.
    log_post_param = aux.log_post_params(prior, sigma, c, t, tau, w0, beta, u,
                                         a_t, b_t)
    log_post = aux.log_post_logwbeta_params(prior, sigma, c, t, tau, w, w0,
                                            beta, n, u, p_ij, a_t, b_t, gamma,
                                            sum_n, adj, x)
    G.graph['log_post'] = log_post
    G.graph['log_post_param'] = log_post_param
    return G
def mcmc(G, iter, nburn, w0=False, beta=False, n=False, u=False, sigma=False,
         c=False, t=False, tau=False, x=False, hyperparams=False, wnu=False,
         all=False, sigma_sigma=0.01, sigma_c=0.01, sigma_t=0.01,
         sigma_tau=0.01, sigma_x=0.01, a_t=200, b_t=1, epsilon=0.01, R=5,
         w_inference='HMC', save_every=1000, init='none', index=None,
         type_prop_x='tNormal'):
    """Run the sampler over the variables of graph ``G`` for ``iter`` steps.

    Each iteration visits, in order, the hyperparameters
    (``up.update_params``), the weights (``up.gibbs_w`` or ``up.HMC_w``), the
    counts ``n`` (``up.update_n``), ``u`` (``up.posterior_u``) and the
    locations ``x`` (``up.update_x``).  A stage runs only when its flag is
    exactly ``True``.  States are appended to the ``*_est`` lists whenever
    ``(i + 1) % save_every == 0``.

    Parameters
    ----------
    G
        Graph whose ``G.graph`` holds ``prior``, ``gamma`` and ``size_x``.
        A missing attribute only prints a message and leaves the
        corresponding local as ``None``.
    iter : int
        Number of iterations.
    nburn : int
        Proposal scales are tuned (``aux.tune``) only while ``i < nburn``.
    w0, beta, n, u, sigma, c, t, tau, x : bool
        Update flags; ``beta`` is forwarded as ``update_beta`` to the HMC
        step.
    hyperparams, wnu, all, init
        Forwarded to ``init_var``.
    sigma_sigma, sigma_c, sigma_t, sigma_tau, sigma_x : float
        Proposal scales.
    a_t, b_t
        Forwarded to the update and log-posterior helpers.
    epsilon, R
        HMC step size and the ``R`` argument of ``up.HMC_w``; ``epsilon`` is
        adapted every 100 iterations from the recent acceptance ``rate``.
    w_inference : {'HMC', 'gibbs'}
        Which weight update to use.
    save_every : int
        Thinning interval for the stored chains.
    index, type_prop_x
        Forwarded to ``up.update_x``.

    Returns
    -------
    tuple
        ``(w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, n_est,
        u_est, log_post_param_est, log_post_est, p_ij_est)``, followed by
        ``x_est`` when ``gamma != 0``.
    """
    size = G.number_of_nodes()
    prior = G.graph['prior'] if 'prior' in G.graph else print(
        'You must specify a prior as attribute of G')
    gamma = G.graph['gamma'] if 'gamma' in G.graph else print(
        'You must specify spatial exponent gamma as G attribute')
    size_x = G.graph['size_x'] if 'size_x' in G.graph else print(
        'You must specify size_x as attribute of G')

    (sigma_est, c_est, t_est, tau_est, w_est, w0_est, beta_est, n_est, x_est,
     p_ij_est, u_est, z_est, ind, selfedge) = init_var(
        G, size, gamma, init, w0, beta, n, u, sigma, c, t, tau, x,
        hyperparams, wnu, all, prior, a_t, b_t, size_x)

    accept_params = [0]
    accept_hmc = 0
    accept_distance = [0]
    rate = [0]
    rate_p = [0]
    step = 100      # tuning / adaptation window
    step_n = 25     # n is refreshed every step_n iterations
    step_x = 1      # x is refreshed every step_x iterations

    sigma_prev = sigma_est[-1]
    c_prev = c_est[-1]
    t_prev = t_est[-1]
    tau_prev = tau_est[-1]
    w_prev = w_est[-1]
    w0_prev = w0_est[-1]
    beta_prev = beta_est[-1]
    n_prev = n_est[-1]
    x_prev = x_est[-1]
    p_ij_prev = p_ij_est[-1]
    u_prev = u_est[-1]
    z_prev = z_est[-1]

    sum_n = np.array(
        csr_matrix.sum(n_prev, axis=0)
        + np.transpose(csr_matrix.sum(n_prev, axis=1)))[0]
    adj = n_prev > 0

    log_post_param_prev = aux.log_post_params(
        prior, sigma_prev, c_prev, t_prev, tau_prev, w0_prev, beta_prev,
        u_prev, a_t, b_t)
    log_post_prev = aux.log_post_logwbeta_params(
        prior, sigma_prev, c_prev, t_prev, tau_prev, w_prev, w0_prev,
        beta_prev, n_prev, u_prev, p_ij_prev, a_t, b_t, gamma, sum_n, adj,
        x_prev, log_post_par=log_post_param_prev)
    log_post_param_est = [log_post_param_prev]
    log_post_est = [log_post_prev]

    for i in range(iter):
        save_now = (i + 1) % save_every == 0 and i != 0

        # update hyperparameters if at least one of them demands the update
        if sigma is True or c is True or t is True or tau is True:
            (sigma_prev, c_prev, t_prev, tau_prev, z_prev, accept_param_prev,
             log_post_param_prev, rate_p_prev) = up.update_params(
                prior, sigma_prev, c_prev, t_prev, tau_prev, z_prev, w0_prev,
                beta_prev, u_prev, log_post_param_prev, accept_params[-1],
                sigma=sigma, c=c, t=t, tau=tau, sigma_sigma=sigma_sigma,
                sigma_c=sigma_c, sigma_t=sigma_t, sigma_tau=sigma_tau,
                a_t=a_t, b_t=b_t)
            accept_params.append(accept_param_prev)
            rate_p.append(rate_p_prev)
            # if you only have to update hyperparams, then
            # log_post = log_post_param, otherwise you need to update that
            if w0 is True or n is True or u is True or x is True:
                log_post_prev = aux.log_post_logwbeta_params(
                    prior, sigma_prev, c_prev, t_prev, tau_prev, w_prev,
                    w0_prev, beta_prev, n_prev, u_prev, p_ij_prev, a_t, b_t,
                    gamma, sum_n, adj, x_prev,
                    log_post_par=log_post_param_prev)
            if save_now:
                sigma_est.append(sigma_prev)
                c_est.append(c_prev)
                t_est.append(t_prev)
                tau_est.append(tau_prev)
                z_est.append(z_prev)
                log_post_param_est.append(log_post_param_prev)
                if w0 is True or n is True or u is True or x is True:
                    log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update hyperparams iteration ', i)
                print('acceptance rate hyperparams = ',
                      round(accept_params[-1] / (i + 1) * 100, 1), '%')
            if (i % step) == 0 and i != 0 and i < nburn:
                if sigma is True:
                    sigma_sigma = aux.tune(accept_params, sigma_sigma, step)
                if c is True:
                    sigma_c = aux.tune(accept_params, sigma_c, step)
                if t is True:
                    sigma_t = aux.tune(accept_params, sigma_t, step)
                if tau is True:
                    sigma_tau = aux.tune(accept_params, sigma_tau, step)

        # update w and beta if at least one of them is True
        if w0 is True:
            if w_inference == 'gibbs':
                w_prev, w0_prev = up.gibbs_w(
                    w_prev, beta_prev, sigma_prev, c_prev, z_prev, u_prev,
                    n_prev, p_ij_prev, gamma, sum_n)
                log_post_param_prev = aux.log_post_params(
                    prior, sigma_prev, c_prev, t_prev, tau_prev, w0_prev,
                    beta_prev, u_prev, a_t, b_t)
                # Every other call site passes the cached hyperparameter
                # term as ``log_post_par``; this one used ``log_post``.
                log_post_prev = aux.log_post_logwbeta_params(
                    prior, sigma_prev, c_prev, t_prev, tau_prev, w_prev,
                    w0_prev, beta_prev, n_prev, u_prev, p_ij_prev, a_t, b_t,
                    gamma, sum_n, adj, x_prev,
                    log_post_par=log_post_param_prev)
                if save_now:
                    w_est.append(w_prev)
                    w0_est.append(w0_prev)
                    beta_est.append(beta_prev)
                    log_post_est.append(log_post_prev)
                    log_post_param_est.append(log_post_param_prev)
                if i % 1000 == 0 and i != 0:
                    print('update w iteration ', i)
            if w_inference == 'HMC':
                (w_prev, w0_prev, beta_prev, accept_hmc, rate_prev,
                 log_post_prev, log_post_param_prev) = up.HMC_w(
                    prior, w_prev, w0_prev, beta_prev, n_prev, u_prev,
                    sigma_prev, c_prev, t_prev, tau_prev, z_prev, gamma,
                    p_ij_prev, a_t, b_t, epsilon, R, accept_hmc, size, sum_n,
                    adj, x_prev, log_post_prev, log_post_param_prev,
                    update_beta=beta)
                rate.append(rate_prev)
                if save_now:
                    w_est.append(w_prev)
                    w0_est.append(w0_prev)
                    beta_est.append(beta_prev)
                    log_post_est.append(log_post_prev)
                    log_post_param_est.append(log_post_param_prev)
                if i % 100 == 0 and i != 0:
                    if i >= step:
                        # nudge the step size so the mean rate over the last
                        # ``step`` iterations moves towards 0.6
                        epsilon = np.exp(
                            np.log(epsilon)
                            + 0.01 * (np.mean(rate[i - step:i]) - 0.6))
                if i % 1000 == 0:
                    print('update w and beta iteration ', i)
                    print('acceptance rate HMC = ',
                          round(accept_hmc / (i + 1) * 100, 1), '%')
                    print('epsilon = ', epsilon)

        # update n
        if n is True and (i + 1) % step_n == 0:
            n_prev, _ = up.update_n(w_prev, G, size, p_ij_prev, ind, selfedge)
            sum_n = np.array(
                csr_matrix.sum(n_prev, axis=0)
                + np.transpose(csr_matrix.sum(n_prev, axis=1)))[0]
            log_post_prev = aux.log_post_logwbeta_params(
                prior, sigma_prev, c_prev, t_prev, tau_prev, w_prev, w0_prev,
                beta_prev, n_prev, u_prev, p_ij_prev, a_t, b_t, gamma, sum_n,
                adj, x_prev, log_post_par=log_post_param_prev)
            if save_now:
                n_est.append(n_prev)
                log_post_param_est.append(log_post_param_prev)
                log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update n iteration ', i)

        # update u
        if u is True:
            u_prev = up.posterior_u(z_prev * w0_prev)
            log_post_param_prev = aux.log_post_params(
                prior, sigma_prev, c_prev, t_prev, tau_prev, w0_prev,
                beta_prev, u_prev, a_t, b_t)
            log_post_prev = aux.log_post_logwbeta_params(
                prior, sigma_prev, c_prev, t_prev, tau_prev, w_prev, w0_prev,
                beta_prev, n_prev, u_prev, p_ij_prev, a_t, b_t, gamma, sum_n,
                adj, x_prev, log_post_par=log_post_param_prev)
            if save_now:
                u_est.append(u_prev)
                log_post_param_est.append(log_post_param_prev)
                log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update u iteration ', i)

        # update x
        if x is True and (i + 1) % step_x == 0:
            x_prev, p_ij_prev, accept_distance_prev, log_post_prev = \
                up.update_x(x_prev, w_prev, gamma, p_ij_prev, n_prev, sigma_x,
                            accept_distance[-1], prior, sigma_prev, c_prev,
                            t_prev, tau_prev, w0_prev, beta_prev, u_prev, a_t,
                            b_t, sum_n, adj, log_post_prev,
                            log_post_param_prev, index, size_x, type_prop_x)
            accept_distance.append(accept_distance_prev)
            if save_now:
                p_ij_est.append(p_ij_prev)
                x_est.append(x_prev)
                log_post_param_est.append(log_post_param_prev)
                log_post_est.append(log_post_prev)
            if i % 1000 == 0:
                print('update x iteration ', i)
                print('acceptance rate x = ',
                      round(accept_distance[-1] * 100 * step_x / (i + 1), 1),
                      '%')
                print('sigma_x = ', sigma_x)
            if (i % (step / step_x)) == 0 and i != 0 and i < nburn:
                sigma_x = aux.tune(accept_distance, sigma_x,
                                   int(step / step_x))

    if gamma != 0:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, \
            n_est, u_est, log_post_param_est, log_post_est, p_ij_est, x_est
    else:
        return w_est, w0_est, beta_est, sigma_est, c_est, t_est, tau_est, \
            n_est, u_est, log_post_param_est, log_post_est, p_ij_est
class TopicSpecificRank:
    """Similar to PageRank but the teleport set is a subset (related topics)
    of all nodes.

    Parameters
    ----------
    beta : float
        Probability with which a random walker follows a link; teleports
        carry the remaining ``1 - beta``.
    edges : collections.defaultdict(list)
        Adjacency list describing the connections in the web-graph.
    epsilon : float
        Convergence threshold: iteration stops once the total absolute
        change in ranks is no longer greater than epsilon.
    max_iterations : int
        Maximum number of power iterations.
    node_num : int
        Number of nodes in the web-graph.
    PageRank_vector : numpy.ndarray [1-dimensional, dtype=float]
        PageRank of each node in the web-graph.

    Parameters follow precisely the above order; none is optional.

    Methods
    -------
    get_similarTopicPages()
        Classifies topic pages into different classes (not implemented).
    matrix_get_initailRankMatrix()
        Initialises the rank vector.
    matrix_get_topicSpecificGoogleMatrix()
        Creates the Google matrix used in power iteration.
    matrix_get_topicSpecificRank()
        Applies power iteration to obtain the topic-specific ranks.
    """

    def __init__(self, beta, edges, epsilon, max_iterations, node_num,
                 PageRank_vector):
        self.beta = beta
        self.edges = edges
        self.epsilon = epsilon
        self.node_num = node_num
        self.PageRank_vector = PageRank_vector
        self.MAX_ITERATIONS = max_iterations

    def get_similarTopicPages(self):
        """Classify pages into topics.

        [INCOMPLETE] Write your own implementation to classify pages into
        topics.

        Returns
        -------
        lol_of_topic_pages : list of list of int
            Intended result: each inner list contains related pages and each
            page belongs to exactly one inner list.  The current stub
            returns ``None``.
        """
        pass

    def matrix_get_initailRankMatrix(self):
        """Build the initial rank vector.

        Returns
        -------
        initial_rank_vector : scipy.sparse.csr_matrix
            [shape = (n x 1), n is `node_num`]
            Ranks are distributed equally among all pages.
        """
        initial_rank_list = [1 / self.node_num for _ in range(self.node_num)]
        initial_rank_vector = SparseMatrix(
            np.matrix(initial_rank_list).transpose())
        return initial_rank_vector

    def matrix_get_topicSpecificGoogleMatrix(self, related_pages):
        """Create the Google matrix used in power iteration.

        Parameters
        ----------
        related_pages : list of int
            Pages which belong to the same topic (the teleport targets).

        Returns
        -------
        google_matrix : scipy.sparse.csr_matrix
            [shape = (n x n), n is `node_num`]
            Column ``p`` holds the share of page ``p``'s rank sent to each
            row page: ``beta / out_degree(p)`` along its edges plus
            ``(1 - beta) / len(related_pages)`` to every related page.
        """
        related_set_size = len(related_pages)
        teleport_matrix_row = []
        teleport_matrix_col = []
        teleport_matrix_data = []
        # Every page teleports to each related page with equal probability.
        for related_node in related_pages:
            for node in range(self.node_num):
                teleport_matrix_col.append(node)
                teleport_matrix_row.append(related_node)
                teleport_matrix_data.append(
                    (1 - self.beta) / related_set_size)
        teleport_matrix = SparseMatrix(
            (teleport_matrix_data,
             (teleport_matrix_row, teleport_matrix_col)),
            shape=(self.node_num, self.node_num))

        connection_matrix_row = []
        connection_matrix_col = []
        connection_matrix_data = []
        # A parent splits beta of its rank evenly among its children.
        for parent_node in range(self.node_num):
            children = self.edges[parent_node]
            for child_node in children:
                connection_matrix_col.append(parent_node)
                connection_matrix_row.append(child_node)
                connection_matrix_data.append(self.beta / len(children))
        connection_matrix = SparseMatrix(
            (connection_matrix_data,
             (connection_matrix_row, connection_matrix_col)),
            shape=(self.node_num, self.node_num))

        google_matrix = connection_matrix + teleport_matrix
        return google_matrix

    def matrix_get_topicSpecificRank(self, teleport_set, initial_rank_vector,
                                     google_matrix):
        """Calculate the topic-specific rank of each node.

        Power iteration is applied until convergence or until
        `MAX_ITERATIONS` is reached, whichever happens first.  Rank that
        leaks out of the system (for example through dead ends) is
        redistributed equally over `teleport_set` after every step.

        [USAGE WARNING] : If the graph is large, the sparse matrix may
        become huge and use up the entire RAM.

        Parameters
        ----------
        teleport_set : list of int
            Pages to which a random walker can teleport.
        initial_rank_vector : scipy.sparse.csr_matrix
            [shape = (n x 1), n is `node_num`]
        google_matrix : scipy.sparse.csr_matrix
            [shape = (n x n), n is `node_num`]

        Returns
        -------
        final_rank_vector : scipy.sparse matrix
            [shape = (n x 1), n is `node_num`]
            Topic-specific rank of each node.  If no iteration runs
            (``MAX_ITERATIONS <= 0``) an all-zero (1 x n) matrix is
            returned, as initialised below.
        """
        iterations = 0
        diff = math.inf
        teleport_set_size = len(teleport_set)
        # Build the 0/1 indicator of the teleport set once, with a set for
        # O(1) membership, instead of scanning the list for every node on
        # every iteration.
        teleport_nodes = set(teleport_set)
        teleport_indicator = np.array(
            [1.0 if node in teleport_nodes else 0.0
             for node in range(self.node_num)])
        final_rank_vector = SparseMatrix(np.zeros(self.node_num).transpose())
        while iterations < self.MAX_ITERATIONS and diff > self.epsilon:
            new_rank_vector = google_matrix @ initial_rank_vector
            leaked_rank = ((1 - SparseMatrix.sum(new_rank_vector))
                           / teleport_set_size)
            leaked_rank_vector = SparseMatrix(
                teleport_indicator * leaked_rank).transpose()
            final_rank_vector = new_rank_vector + leaked_rank_vector
            diff = SparseMatrix.sum(
                abs(final_rank_vector - initial_rank_vector))
            initial_rank_vector = final_rank_vector
            iterations += 1
            print("At iteration: " + str(iterations))
        # The documented result was computed but never handed back.
        return final_rank_vector
def tfpredic(testWord):
    """Classify ``testWord`` against two fixed question corpora via TF-IDF.

    ``testWord`` is appended as the last document of each corpus, each
    corpus is tokenised with ``split_word`` and vectorised with its own
    ``TfidfVectorizer``, and the TF-IDF weights of the last row are summed.

    Parameters
    ----------
    testWord : str
        The incoming question text.

    Returns
    -------
    int
        ``1`` (database 1: questions that must be answered) when the summed
        weight under corpus 1 is strictly smaller than under corpus 2,
        otherwise ``2`` (database 2: questions that need no answer).
    """
    # database 1: questions that must be answered
    text_list = [
        '''ฝุ่นเยอะ มหาลัยยังเปิดเรียนใช่ไหม''',
        '''กำหนดการรับนักศึกษา''',
        '''ปีนี้คณะรับนักศึกษากี่คน''',
        '''สอบเข้า''',
        '''น้ำท่วม เปิดเรียนตามปกติ''',
        '''ย้ายคณะ''',
        '''น้ำท่วมมหาลัยหยุด''',
        '''โอนหน่วยกิต''',
        '''ประชุมอาเซียน มหาลัยปิดเรียน''',
        '''ลาพักการเรียน''',
        '''เทียบโอนหน่วยกิต''',
        '''เกียรตินิยม''',
        '''เปลี่ยนชื่อ''',
        '''ถามเรื่องกู้กยศ''',
        '''โควิดระบาด มหาลัยหยุด''',
        '''ไวรัสระบาด มหาลัยปิดเรียน''',
        '''รับนักศึกษากี่คนปีนี้''',
        '''ปีนี้รับนักศึกษาเยอะ''',
        '''ปีนี้รับ นศ เยอะ''',
        '''วันรับปริญญามีเรียนหรือเปล่า''',
        '''งานรับปริญญา มหาลัยปิดเรียนกี่วัน''',
        '''มหาลัยหยุดวันรับปริญญาถึงวันไหน''',
        '''สำนักทะเบียนวันนี้เปิดทำการตามปกติ''',
        '''สำนักทะเบียนพรุ่งนี้เปิดทำการตามปกติ''',
        '''สหกิจเทอมนี้ไปทำไม่ได้ ทำยังไง''',
        '''ฝึกงานได้ช่วงไหน''',
        '''หนังสือรับรองจบได้เมื่อไหร่''',
        '''สหกิจศึกษาเลื่อน''',
        '''สหกิจยังทำได้''',
        '''วันที่ สำนักทะเบียนทำการปกติ''',
        '''กยศ''',
        '''กู้กยศ''',
        '''ก.ย.ศ.''',
        '''กยศ.''',
        '''สอบ''',
        '''สอบโทอิค''',
        '''จ่ายเงิน''',
        '''ไปฝึกงานไม่ได้เทอมนี้ ควรทำอย่างไร''',
        '''น้ำท่วม หยุดเรียน''',
        '''กำหนดการต่างๆของมหาลัยยังเหมือนเดิมใช่''',
        '''วันที่ ปิดปรับปรุงเว็บ reg ''',
        '''ปิดปรับปรุงเว็บสำนักทะเบียนเมื่อไหร่''',
        '''เลื่อนระยะเวลาสอบ''',
        '''เลื่อนปิดภาคเรียน''',
        '''ซัมเมอร์ยังลงทะเบียนวันเดิม''',
        '''ภาคเรียนที่ 3 ยังลงทะเบียนวันเดิม''',
        '''ภาคเรียนที่สามวันลงทะเบียนเหมือนเดิม''',
        '''เลื่อนวันปิดเทอม''',
        '''มหาลัยเรียนออนไลน์อย่างไม่มีกำหนด''',
        '''หน่วยกิต''',
        testWord
    ]
    # database 2: questions that do not need an answer
    text_list2 = [
        '''คณะวิทยาศาสต์อยู่ตรงไหน''',
        '''เซเว่นไปทางไหน''',
        '''ไอทีมีเซเว่นไหม''',
        '''ในมอมีเซเว่นที่ไหนบ้าง''',
        '''โรงพยาบาลลาดกระบังไปทางไหน''',
        '''เย็นนี้ทานข้าวไหน''',
        '''วันนี้อาจารย์มีประชุม''',
        '''ระบบช้ามาก''',
        '''ขอดูเกรดเทอมนี้''',
        '''ลงทะเบียนไปแล้วกี่หน่วยกิต''',
        '''เหลืออีกหน่วยกิตกว่าจะจบ''',
        '''เหลือวิชาเลือกต้องลงอีกกี่ตัว''',
        '''หอในไปทางไหน''',
        '''เมื่อไหร่โควิดจะหาย''',
        '''เกรดจะออกครบทุกวิชาเมื่อไหร่''',
        '''วิชา อาจารย์ส่งเกรดหรือยัง''',
        '''สำนักคอมไปทางไหน''',
        '''เมื่อไหร่ระบบคำนวณเกรดจะเสร็จ''',
        '''เว็บล่มบ่อย เป็นอะไรนักหนา''',
        '''ร้านถ่ายเอกสารอยู่ไหน''',
        '''คณะวิศวะไปทางไหน''',
        '''คณะคุรุไปทางไหน''',
        '''แล้วไปเที่ยวกัน''',
        '''ธนาคารอยู่ไหน''',
        '''เกรดจะออกเมื่อไหร่''',
        '''ขอดูตารางเรียนส่วนบุคคล''',
        '''ขอดูตารางสอบส่วนตัว''',
        '''สถาปัตยกรรมศาสตร์ไปทางไหน''',
        '''ตึกพระเทพไปทางไหน''',
        '''ตึกพระจอมเกล้าอยู่ตรงไหน''',
        '''ตึกกลางน้ำอยู่ตรงไหน''',
        '''gpax''',
        '''ผลการเรียน''',
        '''ห้องน้ำไปทางไหนคะ''',
        '''หิวข้าวอยากกินข้าวมากๆๆๆ''',
        '''คณะไอทีอยู่ตรงไหน''',
        '''คะแนนสอบออกเมื่อไหร่''',
        '''มีรายวิชาอะไรส่งเกรดแล้วบ้าง''',
        '''อาจารย์ ออกจากคณะหรือยัง''',
        '''บริหารอยู่ตรงไหน''',
        '''เทคโนเกษตรอยู่ตรงไหน''',
        '''ส่งงานอาจารย์ ที่ไหน''',
        '''มีช่องทางติดต่ออาจารย์''',
        '''ตึกศิลปศาสตร์อยู่ตรงไหน''',
        '''ห้องอธิการบดีอยู่ที่ไหน''',
        '''ห้องอาจารย์อยู่ที่ไหน''',
        '''ผลการเรียนเทอมล่าสุด''',
        '''กินข้าวไปเที่ยวอาบน้ำ''',
        '''กินข้าวกันไหมเย็นนี้อยากกินมาก''',
        '''ตึกโหลอยู่ไหน''',
        testWord
    ]

    # Corpus 1: split each sentence into words, then join the words of each
    # sentence back into one ','-separated string so the vectoriser's
    # analyzer can split on ','.
    tokens_list = [split_word(txt) for txt in text_list]
    tokens_list_j = [','.join(tkn) for tkn in tokens_list]
    tvec = TfidfVectorizer(analyzer=lambda x: x.split(','), )
    t_feat = tvec.fit_transform(tokens_list_j)

    # Corpus 2: same pipeline, fitted with its own vectoriser (``tvec2`` was
    # previously created but ``tvec`` was re-fitted instead).
    tokens_list2 = [split_word(txt) for txt in text_list2]
    tokens_list_j2 = [','.join(tkn) for tkn in tokens_list2]
    tvec2 = TfidfVectorizer(analyzer=lambda x: x.split(','), )
    t_feat2 = tvec2.fit_transform(tokens_list_j2)

    # Sum of the TF-IDF weights of the query, which is the last row of each
    # matrix.
    score1 = csr_matrix.sum(t_feat[-1, :])
    score2 = csr_matrix.sum(t_feat2[-1, :])

    if score1 < score2:
        return 1  # database 1: questions that must be answered
    else:
        return 2  # database 2: questions that do not need an answer
def char_counts(df):
    """Add per-session character-familiarity features to ``df``.

    For every row after the first, using only the rows before it:

    * ``percent_seen``: fraction of the row's distinct characters that
      already occurred in an earlier ``text_read``;
    * ``mean_days_since``: mean number of days between the row's ``date``
      and the most recent earlier row containing each already-seen
      character;
    * ``mean_term_freq``: share of all earlier character occurrences that
      belong to the row's already-seen characters.

    Rows for which a quantity is undefined (the first row, empty text, or no
    previously seen character) keep the initial value ``0.0``.  Afterwards
    ``normalize_features`` is applied to ``cum_time``, ``cum_char`` and
    ``mean_days_since`` and three interaction columns are added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``text_read`` and ``date``; rows are addressed with
        ``df.loc[i, ...]`` for ``i`` in ``range(len(df))``, so the index is
        assumed to be the default 0..n-1 range (TODO confirm with callers).

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``normalize_features`` with the interaction
        columns ``timeXper_seen``, ``timeXdays_since`` and
        ``timeXterm_freq`` added.
    """
    print('Calculating time since character last read, etc...')

    # Local imports, as in the original function.
    from sklearn.feature_extraction.text import CountVectorizer
    import numpy as np
    import datetime

    # Document-term matrix with one column per character found in text_read.
    vectorizer = CountVectorizer(decode_error='strict', analyzer='char')
    corpus = df.loc[:, 'text_read']
    dtm = vectorizer.fit_transform(corpus)

    n = df.shape[0]
    df.loc[:, 'percent_seen'] = 0.0
    df.loc[:, 'mean_days_since'] = 0.0
    df.loc[:, 'mean_term_freq'] = 0.0

    # Running state over the rows processed so far.  This replaces
    # re-scanning dtm[:i, :] on every iteration.
    n_chars = dtm.shape[1]
    last_seen_row = np.full(n_chars, -1, dtype=np.int64)  # -1 = never seen
    prior_counts = np.zeros(n_chars, dtype=np.float64)    # count per char

    for i in range(n):
        row_counts = dtm[i, :].toarray().ravel()
        current_chars = np.flatnonzero(row_counts)  # column ids in this row

        if i > 0:  # the first row has no history
            # Column ids of the current characters that were seen before.
            matching_chars = current_chars[last_seen_row[current_chars] >= 0]

            # Percent of the current characters already seen.
            if current_chars.size:
                df.loc[i, 'percent_seen'] = (
                    float(matching_chars.size) / float(current_chars.size))

            # Mean days since the already-seen characters were last read.
            if matching_chars.size:
                current_date = df.loc[i, 'date']
                last_dates = df.loc[last_seen_row[matching_chars], 'date']
                days_since_seen = [current_date - d for d in last_dates]
                df.loc[i, 'mean_days_since'] = (
                    sum(days_since_seen,
                        datetime.timedelta(0)).total_seconds()
                    / 86400.0 / len(days_since_seen))

            # Mean frequency of those characters in the corpus so far, NOT
            # including the current session.  The columns must be selected
            # by vocabulary column id (matching_chars), not by the position
            # of each character inside current_chars.
            denominator = float(prior_counts.sum())
            if denominator > 0:
                numerator = float(prior_counts[matching_chars].sum())
                df.loc[i, 'mean_term_freq'] = numerator / denominator

        # Fold the current row into the history.
        last_seen_row[current_chars] = i
        prior_counts += row_counts

    # Normalize the current features.
    norm_feat_list = ['cum_time', 'cum_char', 'mean_days_since']
    df = normalize_features(df, norm_feat_list)

    # Interaction terms with the normalized cumulative time.
    df.loc[:, 'timeXper_seen'] = (
        df.loc[:, 'norm_cum_time'] * df.loc[:, 'percent_seen'])
    df.loc[:, 'timeXdays_since'] = (
        df.loc[:, 'norm_cum_time'] * df.loc[:, 'norm_mean_days_since'])
    df.loc[:, 'timeXterm_freq'] = (
        df.loc[:, 'norm_cum_time'] * df.loc[:, 'mean_term_freq'])
    return df