def test_constructor_twoline(self):
    pm_inst = ParseAndModel(feature_list=["screen"],
                            filename='../tests/data/parse_and_model/twoLineTest.txt',
                            log_base=2)
    em = EmVectorByFeature(explicit_model=pm_inst)

    expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                           [1, 0, 0, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
    expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282], [0.109], [0.109]])

    self.assertEqual(True, np.array_equiv(expected_section_word_counts_matrix,
                                          csr_matrix.toarray(em.reviews_matrix)),
                     msg="section counts do not match")
    self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                          csr_matrix.toarray(em.background_probability)),
                     msg="background model does not match")
    self.assertEqual(True, np.array_equiv(expected_model_feature_matrix,
                                          np.round(em.topic_model, 3)),
                     msg="topic models do not match")
    print("testing")
def test_bem_two_section(self):
    pm = ParseAndModel()
    section_list = pd.DataFrame([[0, 0, "large clear screen", True],
                                 [0, 1, "large broken bad", True]],
                                columns=["doc_id", "section_id", "section_text", "title"])
    pm.feature_list = ["screen"]
    pm.formatted_feature_list = pm.format_feature_list()
    pm.parsed_text = dict(section_list=section_list)
    pm.model_results = pm.build_explicit_models(lemmatize_words=False, log_base=2)

    expected_model_background = [1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6]
    expected_model_feature = [[0.218, 0.282, 0.282, 0.109, 0.109]]
    expected_section_word_counts = {0: Counter({"large": 1, "clear": 1, "screen": 1}),
                                    1: Counter({"large": 1, "broken": 1, "bad": 1})}
    expected_section_word_counts_matrix = [[1, 1, 1, 0, 0],
                                           [1, 0, 0, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 6, 1 / 6, 1 / 6, 1 / 6])
    expected_model_feature_matrix = np.array([[0.218], [0.282], [0.282], [0.109], [0.109]])
    expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen', 3: 'broken', 4: 'bad'}

    self.assertEqual(True, expected_model_background == pm.model_results["model_background"])
    self.assertEqual(True, expected_model_feature == [[round(val, 3) for val in feature_model]
                                                      for feature_model in pm.model_results["model_feature"]])
    # self.assertEqual(True, expected_section_word_counts == em_input["section_word_counts"])
    self.assertEqual(True, np.array_equiv(expected_section_word_counts_matrix,
                                          csr_matrix.toarray(pm.model_results["section_word_counts_matrix"])))
    self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                          csr_matrix.toarray(pm.model_results["model_background_matrix"])))
    self.assertEqual(True, np.array_equiv(expected_model_feature_matrix,
                                          np.round(pm.model_results["model_feature_matrix"], 3)))
    self.assertEqual(True, expected_vocab_lookup == pm.model_results["vocabulary_lookup"])
def test_bem_one_section(self):
    pm = ParseAndModel()
    section_list = pd.DataFrame([[0, 0, "large clear screen", True]],
                                columns=["doc_id", "section_id", "section_text", "title"])
    pm.feature_list = ["screen"]
    pm.formatted_feature_list = pm.format_feature_list()
    pm.parsed_text = dict(section_list=section_list)
    pm.model_results = pm.build_explicit_models(log_base=2)

    expected_model_background = [1 / 3, 1 / 3, 1 / 3]
    expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
    expected_section_word_counts = {0: Counter({"large": 1, "clear": 1, "screen": 1})}
    expected_section_word_counts_matrix = [[1, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
    expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
    expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

    self.assertEqual(True, expected_model_background == pm.model_results["model_background"])
    self.assertEqual(True, expected_model_feature == pm.model_results["model_feature"])
    # self.assertEqual(True, expected_section_word_counts == em_input["section_word_counts"])
    self.assertEqual(True, np.array_equiv(expected_section_word_counts_matrix,
                                          csr_matrix.toarray(pm.model_results["section_word_counts_matrix"])))
    self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                          csr_matrix.toarray(pm.model_results["model_background_matrix"])))
    self.assertEqual(True, np.array_equiv(expected_model_feature_matrix,
                                          pm.model_results["model_feature_matrix"]))
    self.assertEqual(True, expected_vocab_lookup == pm.model_results["vocabulary_lookup"])
def fit(self, X, y):
    """Fit KFDA model.

    Parameters
    ----------
    X: numpy array of shape [n_samples, n_features]
       Training set.
    y: numpy array of shape [n_samples]
       Target values. Only works for 2 classes.

    Returns
    -------
    self
    """
    n = len(X)
    self._X = X
    # centering matrix H = I_n - (1/n) * 1 1^T
    self._H = np.identity(n) - np.outer(np.ones(n), np.ones(n)) / n
    self._E = OneHotEncoder().fit_transform(y.reshape(n, 1))
    _, counts = np.unique(y, return_counts=True)

    K = self._kernel(X)
    C = np.matmul(np.matmul(self._H, K), self._H)
    self._Delta = np.linalg.inv(C + self.lmb * np.identity(n))
    A = np.matmul(csc_matrix.toarray(self._E.T), C)
    B = np.matmul(self._Delta, csr_matrix.toarray(self._E))
    self._Pi_12 = np.diag(np.sqrt(1.0 / counts))
    P = np.matmul(self._Pi_12, A)
    Q = np.matmul(B, self._Pi_12)
    R = np.matmul(P, Q)
    V, self._Gamma, self._U = np.linalg.svd(R, full_matrices=False)
    return self
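# Minimal usage sketch, not from the original source: it assumes the fit() method above
# belongs to a hypothetical `KFDA` class whose constructor takes a kernel callable and a
# regularization strength `lmb`, which is how `self._kernel` and `self.lmb` are used in fit().
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

X_demo = np.vstack([np.random.randn(10, 3) - 1, np.random.randn(10, 3) + 1])
y_demo = np.array([0] * 10 + [1] * 10)

kfda = KFDA(kernel=rbf_kernel, lmb=0.01)  # hypothetical constructor signature
kfda.fit(X_demo, y_demo)                  # populates _Delta, _Gamma, _U for later projection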
def normalize_transform(self, mode='clr'):
    """
    Some operations may require transformed data.
    This function performs normalization and a clr transform on all OTU tables in a Batch object.
    It returns a deep copy of the original Batch object, so the original file is not modified.

    :param mode: transformation mode; clr (centered log-ratio) or ilr (isometric log-ratio)
    :return: Transformed copy of Batch object.
    """
    batchcopy = copy.deepcopy(self)
    try:
        for x in list(self.otu):
            # normalizes the data by samples
            normbiom = batchcopy.otu[x].norm(axis='sample', inplace=False)
            mat = csr_matrix.toarray(normbiom.matrix_data)
            # replaces all zeros with a small value
            # multiplicative replacement preserves ratios between values
            mat = multiplicative_replacement(mat)
            if mode == 'clr':
                mat = clr(mat)
            elif mode == 'ilr':
                mat = ilr(mat)
            else:
                raise ValueError("Only CLR and ILR transformations are currently supported.")
            normbiom._data = csc_matrix(mat)
            batchcopy.otu[x] = normbiom
    except Exception:
        logger.error("Failed to normalize data", exc_info=True)
    return batchcopy
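# Stand-alone sketch of the transform step above (illustrative only), assuming
# multiplicative_replacement and clr come from skbio.stats.composition, consistent
# with the calls in the method.
import numpy as np
from skbio.stats.composition import multiplicative_replacement, clr

counts = np.array([[10., 0., 5., 5.],
                   [2., 3., 0., 5.]])
props = counts / counts.sum(axis=1, keepdims=True)  # per-sample normalization
props = multiplicative_replacement(props)           # replace zeros, preserve ratios
clr_mat = clr(props)                                 # each row sums to ~0 in log-ratio space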
def load_train(filename, i, nb_timesteps, output_dim):
    x_train = pkl.load(open('db/serialized/' + filename + '_x_train' + str(i + 1) + '.np', 'rb'))
    y_train = pkl.load(open('db/serialized/' + filename + '_y_train' + str(i + 1) + '.np', 'rb'))
    x_train = csr_matrix.toarray(x_train)
    x_train = np.resize(x_train, (x_train.shape[0], nb_timesteps, x_train.shape[1]))
    y_train = np.resize(y_train, (y_train.shape[0], output_dim))
    return x_train, y_train
def predict(self, X):
    X = csr_matrix.toarray(self._fix_test_feats(X))
    W = np.transpose(self.W)
    yhat = np.matmul(X, W)
    predictions = np.zeros(len(yhat), dtype=int)
    for i in range(len(predictions)):
        predictions[i] = np.argmax(yhat[i])
    return predictions
def cos_similarity(X, df, your_pick):
    # Compute similarity of movie: Melvin and Howard
    index = df[df['Title'] == your_pick].index[0]
    d1 = list(csr_matrix.toarray(X[index]))
    mag_d1 = np.linalg.norm(d1)
    dist = []
    for i in range(X.shape[0]):
        row = list(csr_matrix.toarray(X[i]))
        dot_product_xy = np.multiply(d1, row).sum(1)
        mag_row = np.linalg.norm(row)
        x_time_y = mag_d1 * mag_row
        dist.append(dot_product_xy / x_time_y)
    dist_series = pd.Series(dist)
    dist_series = dist_series.sort_values(ascending=False)
    dist_series.iloc[1:6]
    dist_series = pd.DataFrame(dist_series)
    return dist_series
def fit(self, *, X, y, lr):
    W = self.W
    for obs in range(X.shape[0]):  # once for each observation
        x = csr_matrix.toarray(X[obs]).ravel()  # dense 1-D feature vector
        scores = np.dot(x, np.transpose(W))
        yhat = np.argmax(scores)
        if yhat != y[obs]:
            # perceptron update: move away from the predicted class, toward the true class
            W[yhat] = W[yhat] - lr * x
            W[y[obs]] = W[y[obs]] + lr * x
    self.W = W
def train_model(data_cleaned, vocab, num_featuers):
    # This code was adapted from session 2 posted by Dr Jose Camacho Collados Oct-2019
    # accessed Nov-2019
    # https://learningcentral.cf.ac.uk/webapps/blackboard/content/listContent.jsp?course_id=_393342_1&content_id=_5178506_1
    # Apply the most-frequent-words technique:
    # extract the 1st dimension of features (most frequent words), also splitting the
    # features from the target column (both are stored as lists)
    X_train = []
    Y_train = []
    for i, review in data_cleaned.iterrows():
        vector_review = get_vector_text(vocab, data_cleaned.at[i, 'token'])
        X_train.append(vector_review)
        Y_train.append(data_cleaned.at[i, 'label'])
    # Convert them to arrays (NumPy library)
    X_train_sentanalysis = np.asarray(X_train)
    Y_train_sentanalysis = np.asarray(Y_train)
    # End adapted code

    # Extract the 2nd dimension of features (TF-IDF), then convert it to an array
    # (the vectorizer returns a SciPy csr_matrix)
    X_tfId = get_tf_idf(data_cleaned, num_featuers, stop_words)
    X_tfId = sc.toarray(X_tfId)
    # Extract the 3rd dimension of features (HashingVectorizer), then convert it to an array
    X_hash = get_Hashing(data_cleaned, num_featuers)
    X_hash = sc.toarray(X_hash)

    # Concatenate all 3 feature dimensions into one matrix
    X_tfId = np.concatenate((X_tfId, X_hash), axis=1)
    X_train_sentanalysis = np.concatenate((X_train_sentanalysis, X_tfId), axis=1)

    # Define a pipeline containing the feature-selection step and the model.
    # The feature-selection technique is SelectKBest with chi2; k is set to keep half of the
    # concatenated features. In the first training/validation iteration each feature set
    # generates 1000 columns (3000 in total), so after selection this is reduced to the
    # 1500 most highly weighted features.
    # The model is LogisticRegression; its solver is set to 'sag' because of the large data size.
    # Putting both steps in one pipeline minimizes fitting and transforming: the pipeline is fit
    # once, and .predict can be applied to the dev/test sets without running feature selection
    # separately. (The default max_iter of 100 is raised to 2000.)
    model_pipline = Pipeline(steps=[
        ("dimension_reduction", SelectKBest(chi2, k=int(num_featuers * .5))),
        ("classifiers", LogisticRegression(solver='sag', max_iter=2000)),
    ])
    model_pipline.fit(X_train_sentanalysis, Y_train_sentanalysis)
    # return the trained model
    return model_pipline
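# Stand-alone sketch (not part of the original script) of the same SelectKBest(chi2) +
# LogisticRegression pipeline on toy data; chi2 requires non-negative feature values.
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
X_toy = rng.rand(40, 10)                              # 40 samples, 10 non-negative features
y_toy = (X_toy[:, 0] + X_toy[:, 1] > 1).astype(int)

toy_pipeline = Pipeline(steps=[
    ("dimension_reduction", SelectKBest(chi2, k=5)),  # keep half of the features
    ("classifiers", LogisticRegression(solver='sag', max_iter=2000)),
])
toy_pipeline.fit(X_toy, y_toy)
print(toy_pipeline.predict(X_toy[:3]))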
def fit(self, *, X, y, lr):
    W = self.W
    for obs in range(len(y)):  # once for each observation
        x = csr_matrix.toarray(X[obs])
        g = np.dot(W, np.transpose(x))
        for k in range(len(W)):
            p = self.softmax(g)
            correction = p[k] * x
            if k == y[obs]:
                W[k:k + 1] += lr * (x - correction)
            else:
                W[k:k + 1] -= lr * correction
    self.W = W
def write_csr_to_csv(csr_matrix, name):
    graph = csr_matrix.toarray()
    filename = name
    # print("length of graph array:", len(graph))
    # print("csr matrix array:", graph)
    # print("length of one row:", len(graph[0]))
    with open(filename, "w") as writefile:
        writer = csv.writer(writefile)
        writer.writerow(["From", "To", "Weight"])
        # print("WRITING CSR MATRIX TO CSV AT: ", filename)
        for (m, n), value in np.ndenumerate(graph):
            if value != 0:
                writer.writerow([m, n, value])
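# Usage sketch (illustrative only): write a small weighted adjacency matrix to an
# edge-list CSV with the helper above. The file name is made up for the example.
import numpy as np
from scipy.sparse import csr_matrix as sparse_matrix

adjacency = sparse_matrix(np.array([[0, 2, 0],
                                    [0, 0, 1],
                                    [3, 0, 0]]))
write_csr_to_csv(adjacency, "toy_graph.csv")  # one From,To,Weight row per non-zero entry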
def reshape_input_data(x_ro, x_md):
    """
    Concatenates the input data into shape (num_samples, sample_size, 2).

    Parameters
    ----------
    x_ro: sparse matrix
        TF-IDF encoding of Romanian input samples.
    x_md: sparse matrix
        TF-IDF encoding of Moldavian input samples.

    Returns
    -------
    result
        Numpy ndarray representing the concatenated data.
    """
    assert x_ro.shape == x_md.shape
    num_samples, sample_size = x_ro.shape
    result = np.stack([csr_matrix.toarray(x_ro), csr_matrix.toarray(x_md)], axis=-1)
    return result
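# Usage sketch (illustrative only): two equally shaped sparse TF-IDF-like matrices are
# stacked into a (num_samples, sample_size, 2) array by the function above.
import numpy as np
from scipy.sparse import csr_matrix

x_ro_demo = csr_matrix(np.array([[0.1, 0.0, 0.3],
                                 [0.0, 0.5, 0.0]]))
x_md_demo = csr_matrix(np.array([[0.2, 0.0, 0.0],
                                 [0.4, 0.0, 0.6]]))
stacked = reshape_input_data(x_ro_demo, x_md_demo)
print(stacked.shape)  # (2, 3, 2)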
def test_constructor_one_section(self):
    pm = ParseAndModel(feature_list=["screen"],
                       filename='data/parse_and_model/twoLineTest.txt',
                       lemmatize_words=False, nlines=1)
    section_list = pd.DataFrame([[0, 0, "large clear screen", True]],
                                columns=["doc_id", "section_id", "section_text", "title"])

    expected_model_background = [1 / 3, 1 / 3, 1 / 3]
    expected_model_feature = [[1 / 3, 1 / 3, 1 / 3]]
    expected_section_word_counts = {0: Counter({"large": 1, "clear": 1, "screen": 1})}
    expected_section_word_counts_matrix = [[1, 1, 1]]
    expected_model_background_matrix = np.array([1 / 3, 1 / 3, 1 / 3])
    expected_model_feature_matrix = np.array([[1 / 3], [1 / 3], [1 / 3]])
    expected_vocab_lookup = {0: 'large', 1: 'clear', 2: 'screen'}

    self.assertEqual(True, expected_model_background == pm.model_results["model_background"])
    self.assertEqual(True, expected_model_feature == pm.model_results["model_feature"])
    # self.assertEqual(True, expected_section_word_counts == em_input["section_word_counts"])
    self.assertEqual(True, np.array_equiv(expected_section_word_counts_matrix,
                                          csr_matrix.toarray(pm.model_results["section_word_counts_matrix"])))
    self.assertEqual(True, np.array_equiv(expected_model_background_matrix,
                                          csr_matrix.toarray(pm.model_results["model_background_matrix"])))
    self.assertEqual(True, np.array_equiv(expected_model_feature_matrix,
                                          pm.model_results["model_feature_matrix"]))
    self.assertEqual(True, expected_vocab_lookup == pm.model_results["vocabulary_lookup"])
def sampling(adata, axis=0, nsamples=500, method="sps", optm_parameters=True,
             pinit=0.195, pfin=0.9, K=500):
    ob = adata.X
    ob = csr_matrix.toarray(ob)
    # sampling rows
    if axis == 0:
        ob = ob.T
    # print(ob.shape)
    if nsamples >= ob.shape[1]:
        print("Number of samples is greater than the number of columns. Sampling can't be done")
        exit(0)
    no_samples = ob.shape[1]
    init = no_samples if no_samples < 20000 else min(20000, round(no_samples / 3))
    # random sample of ids from sample = 0 to no_samples - 1 of size init
    sample_ids = np.random.choice(list(range(0, no_samples, 1)), init)
    data = normalize(ob)
    data = np.take(ob, sample_ids, axis=1)
    partition = annPartition(data)
    if optm_parameters:
        param = optimized_param(partition, nsamples)
        pinit = param[0]
        pfin = param[1]
        K = param[2]
        print("Optimized parameters: ", param, "\n")
    unique_elements, counts_elements = np.unique(partition[:, 1], return_counts=True)
    cluster_freq = np.asarray(counts_elements, dtype=int)
    # print(cluster_freq.shape)
    prop = np.round((pinit - np.exp(-cluster_freq / K) * (pinit - pfin)) * cluster_freq)
    cluster_freq = np.vstack((cluster_freq, prop)).T
    subsamples = np.empty((0))
    for i in range(len(prop)):
        subsamples = np.concatenate(
            (subsamples,
             np.random.choice(partition[partition[:, 1] == i, 0], size=int(prop[i]), replace=False)),
            axis=None)
    subsamples = np.asarray(subsamples, dtype=int)
    print(len(subsamples), "samples extracted. Returning indices of samples")
    # Returning indices of selected samples
    return subsamples
def run(self, filter_xtrim, by_group):
    if len(self.id2doc) == 0:
        if by_group:
            os.makedirs(os.path.dirname(f"{self.dir}/{self.sentiment}_model/"), exist_ok=True)
            documents = self.prepare_documents_by_group(filter_xtrim)
            self.transform_into_featuresets(documents)
        else:
            os.makedirs(os.path.dirname(f"{self.dir}/{self.sentiment}_model/"), exist_ok=True)
            documents = self.prepare_documents(filter_xtrim)
            self.transform_into_featuresets(documents)

    os.makedirs(os.path.dirname(f"{self.dir}/{self.sentiment}_results/local_v_foreign/"), exist_ok=True)
    doc_pairs = {}
    doc_count = 0
    for doc_num in self.id2doc:
        doc_count += 1
        doc = self.id2doc[doc_num]
        # get word probability distribution - it is l2 normalized
        prob_dist = csr_matrix.toarray(self.tfidf_matrix[doc_num, :])[0]
        # print(np.sum(np.square(prob_dist)))
        # get terms (features) and their probabilities in descending order
        sorted_indices = np.argsort(prob_dist)[::-1]
        sorted_features = np.array(self.tfidf_vectorizer.get_feature_names())[sorted_indices]
        temp = [i for i in prob_dist]
        temp.sort()
        sorted_prob = temp[::-1]
        word_prob = list(zip(sorted_features, sorted_prob))
        # keep words with probability greater than 0 whose sentiment probability is larger than 0.6
        rep_words = []
        for w in [w for w in word_prob if w[1] > 0]:
            sent = " ".join(w[0].split("_"))
            sent_prob = self.sentiment_classifier.sentiment(sent)
            if sent_prob[0] == self.sentiment and sent_prob[1] > 0.6:
                rep_words.append(w)
        doc_pairs.setdefault(doc.name, []).append((doc.location, rep_words))
        print("\r", end="")
        print("Getting relevant sentimental words", int(doc_count / len(self.id2doc) * 100),
              "percent", end="", flush=True)

    # local-foreign review difference
    doc_count = 0
    for doc_name in doc_pairs:
        doc_count += 1
        local_pdist = []
        foreign_pdist = []
        # find unique words
        for loc_prob_tuple in doc_pairs[doc_name]:
            if loc_prob_tuple[0] == "sgp":
                local_pdist = loc_prob_tuple[1]
            else:
                foreign_pdist = loc_prob_tuple[1]
        local_dict = {k: v for (k, v) in local_pdist}
        foreign_dict = {k: v for (k, v) in foreign_pdist}

        wdiff = self.rank_words(local_dict)
        filename = doc_name.replace(".csv", "") + "_sgp.csv"
        with open(f"{self.dir}/{self.sentiment}_results/local_v_foreign/{filename}", "w",
                  encoding="utf8") as writer:
            writer.writelines([f"{w[0]},{w[1]}\n" for w in wdiff])

        wdiff = self.rank_words(foreign_dict)
        filename = doc_name.replace(".csv", "") + "_ovs.csv"
        with open(f"{self.dir}/{self.sentiment}_results/local_v_foreign/{filename}", "w",
                  encoding="utf8") as writer:
            writer.writelines([f"{w[0]},{w[1]}\n" for w in wdiff])
Nz, Dz = X.shape
s = (np.ones((Nz, 1))) * 0.2  # 10, 60
Knum = 5
W = kneighbors_graph(X, Knum, mode='distance', include_self=True)
hidden_size = 30
maps = spectral_embedding(W, n_components=hidden_size)
######################################################################
W = sparse.csr_matrix(W)
W1 = W.toarray()
W = csr_matrix.toarray(W)
params, gammasC = lr_init(maps, K, K)
print(maps.dtype)
model = GAN(hidden_size, batchSize, 1e-1, maps)
numIter = 0
loss_value = 0
training_loss = 0
training_loss1 = 0
while numIter < 100:
    gammasC, params, P = bayesianLowrankModel(maps, params, gammasC, K, K, W)
    for i in range(batch):
        images = X[batchSize * i:batchSize * (i + 1), :] / 255
        maps1 = maps[batchSize * i:batchSize * (i + 1), :]
        R_loss, loss_value, loss_value1 = model.update_params1(images, images, maps1)
import numpy as np
import os
import time
import pandas as pd
import joblib
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.svm import LinearSVC
from scipy.sparse import csr_matrix
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt

original_set_train = csr_matrix.toarray(original_set_train)
original_set_test = csr_matrix.toarray(original_set_test)


def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate 3 plots: the test and training learning curve, the training
    samples vs fit times curve, the fit times vs score curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
dropout_rate = 0.4

# TF-IDF / fitting and transforming train data (node embedding)
vect = TfidfVectorizer(
    decode_error="ignore",
    sublinear_tf=True,
    ngram_range=(1, 1),
    min_df=0.0149,
    max_df=0.9,
    binary=False,
    smooth_idf=True,
)
X_embed = vect.fit_transform(cleaned_train_data + cleaned_test_data)

# Setting the features of all nodes
features_matrix = csr_matrix.toarray(X_embed)

# Creating indices to split data into training and test sets
idx = np.random.RandomState(seed=42).permutation(n_hosts)
index_train = idx[: int(0.8 * n_hosts)]
index_test = idx[int(0.8 * n_hosts):]

# Transforming the numpy matrices/vectors to torch tensors
features = torch.FloatTensor(features_matrix)
y = torch.LongTensor(y)
adj = torch.FloatTensor(adj)
index_train = torch.LongTensor(index_train)
index_test = torch.LongTensor(index_test)

# Applying the GNN model on the subgraph H
model = GNN(features.shape[1], n_hidden_1, n_hidden_2, n_class, dropout_rate)
def VB_Decomp_Gen(M: Union[csr_matrix, np.ndarray], rank: int,
                  maxiter: int = 100) -> Tuple[np.ndarray, np.ndarray]:
    # init
    I = M.shape[0]
    J = M.shape[1]
    n = rank
    sigma_sq = np.ones(n)
    rho_sq = np.ones(n) / n
    tau_sq = 1
    u_bar = []
    v_bar = []
    t = []
    S, Phi, Psi = [], [], []
    for i in range(0, I):
        Phi.append(np.eye(n))
        u_bar.append(np.random.normal(0, 1, n))
    for j in range(0, J):
        Psi.append(np.eye(n))
        S.append(np.diag(1 / rho_sq))
        t.append(np.zeros(n))
        v_bar.append(np.random.normal(0, 1, n))
    Phi = np.array(Phi)
    Psi = np.array(Psi)
    S = np.array(S)
    t = np.array(t)
    u_bar = np.array(u_bar)
    v_bar = np.array(v_bar)
    norm_u = 0
    norm_v = 0
    N = []
    for i in range(0, I):
        N.append(scipy.sparse.find(M[i])[1])
    ob = scipy.sparse.find(M)

    # EM iteration
    for iter in range(0, maxiter):
        # E step
        # update Q(u_i)
        for i in range(0, I):
            outer = np.zeros((n, n))
            N_i = N[i]
            for j in N_i:
                outer += np.outer(v_bar[j], v_bar[j])
            Phi[i] = np.linalg.inv(np.diag(1 / sigma_sq) + (Psi[N_i].sum(0) + outer) / tau_sq)
            mtplr = ((M[i, N_i] * (v_bar[N_i])) / tau_sq).sum(0)
            u_bar[i] = Phi[i].dot(mtplr)
            S[N_i] += (Phi[i] + np.outer(u_bar[i], u_bar[i])) / tau_sq
            t[N_i] += (np.outer(csr_matrix.toarray(M[i, N_i]), (u_bar[i])) / tau_sq)
        # update Q(v_j)
        Psi = np.linalg.inv(S)
        for j in range(0, J):
            v_bar[j] = Psi[j].dot(t[j])

        # M step
        for l in range(0, n):
            sigma_sq[l] = ((Phi[:, l, l] + u_bar[:, l] ** 2).sum()) / (I - 1)
        K = len(ob[1])
        Tr = 0
        for i, j in np.array([ob[0], ob[1]]).T:
            A = Phi[i] + np.outer(u_bar[i], u_bar[i])
            B = Psi[j] + np.outer(v_bar[j], v_bar[j])
            Tr += np.trace(A.dot(B))
        tau_sq = (((ob[2] ** 2) - (2 * ob[2] * np.einsum(
            'ij,ij->i', u_bar[ob[0]], v_bar[ob[1]]))).sum() + Tr) / (K - 1)

        cur_norm_u = np.linalg.norm(u_bar)
        cur_norm_v = np.linalg.norm(v_bar)
        if abs(cur_norm_u - norm_u) < 0.01 or abs(cur_norm_v - norm_v) < 0.01:
            break
        else:
            norm_u, norm_v = cur_norm_u, cur_norm_v

    yield np.array(u_bar), np.array(v_bar)
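# Usage sketch (illustrative, not from the original source): run the variational
# decomposition generator on a small fully observed sparse matrix and inspect the
# reconstruction error of the recovered rank-2 factors.
import numpy as np
from scipy.sparse import csr_matrix

rng = np.random.RandomState(0)
U_true = rng.rand(8, 2) + 0.1
V_true = rng.rand(6, 2) + 0.1
M_demo = csr_matrix(U_true.dot(V_true.T))  # every entry observed (non-zero)

u_est, v_est = next(VB_Decomp_Gen(M_demo, rank=2, maxiter=50))
print(np.linalg.norm(M_demo.toarray() - u_est.dot(v_est.T)))  # reconstruction error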
def GDS_model(train):
    x_train, x_test, y_train, y_test = load(train)

    ## Trim data
    l = int(len(y_train) / 16) * 16
    x_train = x_train[0:l]
    y_train = y_train[0:l]
    x_train = x_train[0:16]
    y_train = y_train[0:16]
    l = int(len(y_test) / 16) * 16
    x_test = x_test[0:l]
    y_test = y_test[0:l]

    ## Network structure
    nb_timesteps = 1
    nb_features = x_train.shape[1]
    output_dim = 1

    ## cross-validated model parameters
    batch_size = 16
    dropout = 0.25
    activation = 'sigmoid'
    nb_hidden = 128
    initialization = 'glorot_normal'

    ## reshaping X to three dimensions
    x_train = csr_matrix.toarray(x_train)
    x_train = np.resize(x_train, (x_train.shape[0], nb_timesteps, x_train.shape[1]))
    x_test = csr_matrix.toarray(x_test)
    x_test = np.resize(x_test, (x_test.shape[0], nb_timesteps, x_test.shape[1]))

    ## reshape Y to appropriate dimensions
    y_train = np.resize(y_train, (y_train.shape[0], output_dim))
    y_test = np.resize(y_test, (y_test.shape[0], output_dim))

    ## Initialize model
    model = Sequential()
    model.add(Masking(mask_value=0., batch_input_shape=(batch_size, nb_timesteps, nb_features),
                      name='Mask'))  # embedding for variable input lengths
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization,
                  name='GRU01', batch_input_shape=(batch_size, nb_timesteps, nb_features)))
    model.add(Dropout(dropout, name='DO_01'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU02'))
    model.add(Dropout(dropout, name='DO_02'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU03'))
    model.add(Dropout(dropout, name='DO_03'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU04'))
    model.add(Dropout(dropout, name='DO_04'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU05'))
    model.add(Dropout(dropout, name='DO_05'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU06'))
    model.add(Dropout(dropout, name='DO_06'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU07'))
    model.add(Dropout(dropout, name='DO_07'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU08'))
    model.add(Dropout(dropout, name='DO_08'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU09'))
    model.add(Dropout(dropout, name='DO_09'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU10'))
    model.add(Dropout(dropout, name='DO_10'))
    model.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name='GRU11'))
    model.add(Dropout(dropout, name='DO_11'))
    model.add(GRU(nb_hidden, stateful=True, init=initialization, name='GRU12'))
    model.add(Dropout(dropout, name='DO_12'))
    model.add(Dense(output_dim, activation=activation, name='Output'))

    # Configure learning process
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    # Prepare model checkpoints and callbacks
    filepath = "db/results/" + train + "_best_weights.h5"
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=False)
    csv_logger = CSVLogger('db/results/training_log.csv', separator=',', append=True)

    # Training
    print('Training')
    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        verbose=1,
                        epochs=1,
                        shuffle=False,  # turn off shuffle to ensure training data patterns remain sequential
                        callbacks=[checkpointer, csv_logger],
                        validation_data=(x_test, y_test))

    ## Evaluating on best results
    model.load_weights(filepath=filepath)
    score = model.evaluate(x_test, y_test, batch_size=16, verbose=1)
    score = dict(zip(model.metrics_names, score))
    summary = model.summary()
    model.save('db/results/model_' + train + '.h5')
    return history, score, summary
train = pd.read_csv("training.csv", low_memory=False, index_col="article_number")
test = pd.read_csv("test.csv", low_memory=False, index_col="article_number")

# Create Ordinal Encoding
le = LabelEncoder().fit(train.topic)
train["label"] = le.transform(train.topic)
test["label"] = le.transform(test.topic)

# Split into x and y
train_x = train.drop(["label", "topic"], axis=1)
test_x = test.drop(["label", "topic"], axis=1)
train_y = train["label"]
test_y = test["label"]

# Create TF-IDF features from article words
tfidf = TfidfVectorizer(max_features=500).fit(train_x.article_words)

# Transform words and convert from sparse matrix to array
train_tfidf = csr_matrix.toarray(tfidf.transform(train_x.article_words))
test_tfidf = csr_matrix.toarray(tfidf.transform(test_x.article_words))
words = train_tfidf
test_words = test_tfidf

# Final model -- for how parameters were found see logistic_regression_hpo.py
model = LogisticRegression(C=16, class_weight='balanced', penalty='l2', max_iter=700)
model.fit(words, train_y)
print(classification_report(test_y, model.predict(test_words)))
def score(self, X):
    X = csr_matrix.toarray(self._fix_test_feats(X))
    W = self.W
    yhat = np.matmul(W, np.transpose(X))  # yhat[k, i] gives prob that sample xi is in class k
    return yhat[1]
def IKPP_model(test):
    ## set time
    time1 = datetime.datetime.today()

    ## Trim data
    x_train, x_test, y_train, y_test = load(test)
    l = int(len(y_train) / 16) * 16
    x_train = x_train[0:l]
    y_train = y_train[0:l]
    l = int(len(y_test) / 16) * 16
    x_test = x_test[0:l]
    y_test = y_test[0:l]

    ## Network structure
    nb_timesteps = 1
    nb_features = x_train.shape[1]
    output_dim = 1

    ## cross-validated model parameters
    batch_size = 16
    dropout = 0.25
    activation = 'sigmoid'
    nb_hidden = 128
    initialization = 'glorot_normal'

    ## reshaping X to three dimensions
    x_train = csr_matrix.toarray(x_train)
    x_train = np.resize(x_train, (x_train.shape[0], nb_timesteps, x_train.shape[1]))
    x_test = csr_matrix.toarray(x_test)
    x_test = np.resize(x_test, (x_test.shape[0], nb_timesteps, x_test.shape[1]))

    ## reshape Y to appropriate dimensions
    y_train = np.resize(y_train, (y_train.shape[0], output_dim))
    y_test = np.resize(y_test, (y_test.shape[0], output_dim))

    ## Load model
    IKPP = load_model('db/results/model_' + train + '.h5')
    IKPP.load_weights('db/results/' + train + '_best_weights.h5')

    ## Freeze layers
    for layer in IKPP.layers[:20]:
        layer.trainable = False

    ## Reset weights
    reset = 0
    if reset == 1:
        for layer in IKPP.layers[-6:]:
            layer.reset_states()

    ## Decoder
    decoder = Sequential()
    decoder.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization,
                    name="Encoder", batch_input_shape=(batch_size, nb_timesteps, nb_features)))
    decoder.add(GRU(nb_hidden, return_sequences=True, stateful=True, init=initialization, name="Decoder"))
    decoder.add(Dense(IKPP.layers[0].input_shape[2], activation="linear"))
    # plot_model(decoder, 'db/models/decoder.png')

    ## Combine models
    # merged = Sequential()
    # merged.add(decoder)
    # merged.add(IKPP)
    merged = Model(inputs=decoder.input, outputs=IKPP(decoder.output))
    merged.layers[-1].get_input_at(-2)
    merged.layers[-1].get_input_mask_at(-3)
    merged.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

    # Prepare model checkpoints and callbacks
    filepath = "db/results/" + test + "_best_weights.h5"
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=0, save_best_only=False)
    csv_logger = CSVLogger('db/results/model_' + test + '.csv', separator=',', append=True)

    ## Predict with the un-tuned model
    score_UT = merged.evaluate(x_test, y_test, verbose=1, batch_size=16)
    score_UT = dict(zip(merged.metrics_names, score_UT))

    # Training
    print('Training')
    while True:
        time2 = datetime.datetime.today()
        history = merged.fit(x_train, y_train,
                             batch_size=batch_size,
                             verbose=1,
                             nb_epoch=1,
                             shuffle=False,  # turn off shuffle to ensure training data patterns remain sequential
                             callbacks=[checkpointer, csv_logger],
                             validation_data=(x_test, y_test))
        time3 = datetime.datetime.today()
        if (time3 - time2).seconds * 2 + (time2 - time1).seconds >= 600:
            break

    ## Evaluating on best results
    merged.load_weights(filepath=filepath)
    score = merged.evaluate(x_test, y_test, batch_size=16, verbose=1)
    score = dict(zip(merged.metrics_names, score))
    summary = merged.summary()
    merged.save('db/results/model_' + test + '.h5')
    return history, score, summary
    Z = Z.reshape(xx.shape)
    out = ax.contourf(xx, yy, Z, **params)
    return out


def make_meshgrid(x, y, h=.02):
    x_min, x_max = x.min() - 1, x.max() + 1
    y_min, y_max = y.min() - 1, y.max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    return xx, yy


if __name__ == '__main__':
    train_features, train_labels = load_svmlight_file('twofeature.txt')
    x0, x1 = csr_matrix.toarray(train_features).T
    colors = ['red' if label == 1 else 'green' for label in train_labels]
    xx, yy = make_meshgrid(x0, x1)
    fig, ax = plt.subplots(1, 2)
    indC = 0
    for C in [1, 100]:
        clf = svm.SVC(kernel='linear', C=C)
        clf.fit(np.array([x0, x1]).T, train_labels)
        plot_contours(ax[indC], clf, xx, yy)
        ax[indC].scatter(x0, x1, c=train_labels, cmap=plt.cm.coolwarm, edgecolors='k')
        ax[indC].set_title(f'C={C}')
        indC += 1
def rmse(y, pred):
    sub = [(a_i - b_i) ** 2 for a_i, b_i in zip(csr_matrix.toarray(y), pred)]
    sum_sqr = sum(sum(sub))
    size = y.shape[0] * y.shape[1]
    return sqrt(sum_sqr / size)
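# Usage sketch (illustrative only): RMSE between a sparse target matrix and a dense
# prediction of the same shape, using the helper above.
import numpy as np
from math import sqrt
from scipy.sparse import csr_matrix

y_demo = csr_matrix(np.array([[3.0, 0.0],
                              [0.0, 4.0]]))
pred_demo = np.array([[2.5, 0.0],
                      [0.0, 4.5]])
print(rmse(y_demo, pred_demo))  # sqrt((0.25 + 0.25) / 4) ~= 0.354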
def load_test(filename):
    x_test = pkl.load(open('db/serialized/' + filename + '_x_test.np', 'rb'))
    y_test = pkl.load(open('db/serialized/' + filename + '_y_test.np', 'rb'))
    x_test = csr_matrix.toarray(x_test)
    return (x_test, y_test)
    modelo = naive_bayes.fit(X_, y_)
    print("Model created: " + str(modelo))
    return modelo


def crearModelo_RForest(X_, y_):
    modelo_ranForest = RandomForest.fit(X_, y_)
    print("Model created: " + str(modelo_ranForest))
    return modelo_ranForest


###########################################################
support_vectors = modelo_svm.support_vectors_
X_train_ = csr_matrix.toarray(X_train)
support_vectors_ = csr_matrix.toarray(support_vectors)

plt.scatter(X_train_[:, 0], X_train_[:, 1])
plt.scatter(support_vectors_[:, 0], support_vectors_[:, 1], color='red')
plt.title('Support vectors over training data')
plt.xlabel('X1')
plt.ylabel('X2')
plt.show()

X_test_ = csr_matrix.toarray(X_test)
y_test_ = pd.Series.to_numpy(y_test)
value = 1.5
width = 0.75
plot_decision_regions(X_test_, y_test_, clf=modelo_svm,
nb_classes = 2
nb_features = X_train.shape[1]
output_dim = 1

# Define cross-validated model parameters
batch_size = 14
dropout = 0.25
activation = 'sigmoid'
nb_hidden = 128
initialization = 'glorot_normal'

# Reshape X to three dimensions
# Should have shape (batch_size, nb_timesteps, nb_features)
X_train = csr_matrix.toarray(X_train)  # convert from sparse matrix to N-dimensional array
X_train = np.resize(X_train, (X_train.shape[0], nb_timesteps, X_train.shape[1]))
print('X_train shape:', X_train.shape)
X_test = csr_matrix.toarray(X_test)  # convert from sparse matrix to N-dimensional array
X_test = np.resize(X_test, (X_test.shape[0], nb_timesteps, X_test.shape[1]))
print('X_test shape:', X_test.shape)

# Reshape y to two dimensions
# Should have shape (batch_size, output_dim)
y_train = np.resize(y_train, (X_train.shape[0], output_dim))
print('Validating model with ' + str(num_features) + ' features...')
X_dev_sentanalysis = []
Y_dev = []
for i, review in dev_cleaned.iterrows():
    # extracting the 1st dimension of features (most frequent words), also splitting the
    # features from the target column (both are stored as lists)
    vector_instance = get_vector_text(vocabulary[:num_features], dev_cleaned.at[i, 'token'])
    X_dev_sentanalysis.append(vector_instance)
    Y_dev.append(dev_cleaned.at[i, 'label'])

# convert the previous lists to arrays (NumPy library) for prediction on the model
X_dev_sentanalysis = np.asarray(X_dev_sentanalysis)
Y_dev_gold = np.asarray(Y_dev)

# extracting the 2nd dimension of features (TF-IDF), then converting it to an array (SciPy library)
X_dev_TF1 = get_tf_idf(dev_cleaned, num_features, stop_words)
X_dev_TF = sc.toarray(X_dev_TF1)

# extracting the 3rd dimension of features (HashingVectorizer), then converting it to an array (SciPy library)
X_dev_hash = get_Hashing(dev_cleaned, num_features)
X_dev_hash = sc.toarray(X_dev_hash)

# Concatenate all 3 dimensions into one matrix
X_dev_TF = np.concatenate((X_dev_TF, X_dev_hash), axis=1)
X_dev = np.concatenate((X_dev_sentanalysis, X_dev_TF), axis=1)

#######
# Predicting on the dev set features, then calculating the performance measures
Y_dev_predictions = model.predict(X_dev)
print('Done')
# print('\n')