def plot(X_transform, model, figsize=(15, 6)):
    """
    Plots a scatter chart to assess how the data are distributed after training.
    ----------
    parameters:
    X_transform: data transformed into numeric form
    model: fitted KMeans model instance
    figsize: figure size
    """
    # Use truncated SVD to reduce the data to 2 dimensions.
    svd = SVD(n_components=2, random_state=0)
    vectorizer_2D = svd.fit_transform(X_transform)

    plt.figure(figsize=figsize)
    plt.scatter(vectorizer_2D[:, 0], vectorizer_2D[:, 1], c=model.labels_)
    # Note: the centroids are plotted using their first two original coordinates,
    # not the SVD projection.
    plt.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1],
                s=200, color='black', label='Centroids')
    plt.title('Cluster of Tweets')
    plt.legend()
    plt.show()
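# A minimal usage sketch for plot() above (illustrative only): the tweet texts,
# cluster count, and variable names are hypothetical, and the sketch assumes the
# same module-level imports the function relies on (matplotlib.pyplot as plt and
# TruncatedSVD as SVD).
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

tweets = ["great phone, love it", "battery died fast", "love the camera", "terrible battery"]
X_transform = TfidfVectorizer().fit_transform(tweets)        # sparse TF-IDF matrix
kmeans = KMeans(n_clusters=2, random_state=0).fit(X_transform)
plot(X_transform, kmeans)                                     # scatter of the 2-D SVD projection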
def infer_topics(self, num_topics=10, **kwargs):
    self.nb_topics = num_topics
    # Note: despite the variable name, topics here are inferred with truncated SVD (LSA).
    nmf = SVD(n_components=num_topics)
    topic_document = nmf.fit_transform(self.corpus.sklearn_vector_space)
    self.topic_word_matrix = []
    self.document_topic_matrix = []
    vocabulary_size = len(self.corpus.vocabulary)

    row = []
    col = []
    data = []
    for topic_idx, topic in enumerate(nmf.components_):
        for i in range(vocabulary_size):
            row.append(topic_idx)
            col.append(i)
            data.append(topic[i])
    self.topic_word_matrix = coo_matrix(
        (data, (row, col)),
        shape=(self.nb_topics, len(self.corpus.vocabulary))).tocsr()

    row = []
    col = []
    data = []
    doc_count = 0
    for doc in topic_document:
        topic_count = 0
        for topic_weight in doc:
            row.append(doc_count)
            col.append(topic_count)
            data.append(topic_weight)
            topic_count += 1
        doc_count += 1
    self.document_topic_matrix = coo_matrix(
        (data, (row, col)),
        shape=(self.corpus.size, self.nb_topics)).tocsr()
def main():
    if len(sys.argv) < 2:
        print('Expected arguments are not provided.')
        return

    actorid = int(sys.argv[1])
    imdb_actor_info = util.read_imdb_actor_info()
    input_actor_name = imdb_actor_info[imdb_actor_info['id'] == actorid]['name'].values[0]

    tf_idf_matrix = util.get_tf_idf_matrix()
    #print(tf_idf_matrix)
    actor_tf_idf = tf_idf_matrix.loc[actorid]
    #print(actor_tf_idf)

    svd = SVD(n_components=no_of_components)
    svd.fit(tf_idf_matrix)
    svd_df = pd.DataFrame(svd.transform(tf_idf_matrix), index=tf_idf_matrix.index)
    input_actor_row = svd_df.loc[actorid]

    actors = []
    for index, row in svd_df.iterrows():
        name = imdb_actor_info[imdb_actor_info['id'] == index]['name'].values[0]
        actors.append((index, name, 1 - cosine(row, input_actor_row)))

    other_actors = list(filter(lambda tup: tup[0] != actorid, actors))
    other_actors.sort(key=lambda tup: tup[2], reverse=True)

    util.print_output(actorid, input_actor_name, other_actors[:no_of_actors])
    util.write_output_file(actorid, input_actor_name, other_actors[:no_of_actors], output_file)
def test_grid_search_model(self):
    X, y = make_classification(random_state=42)
    param_grid = [{
        'pca': [PCA(2)],
        'lr__fit_intercept': [False, True]
    }, {
        'pca': [SVD(2)],
        'lr__fit_intercept': [False, True]
    }]
    pipe = Pipeline([('pca', 'passthrough'), ('lr', LogisticRegression())])
    grid0 = GridSearchCV(pipe, param_grid, error_score='raise')
    grid0.fit(X, y)

    pipe = PipelineCache([('pca', 'passthrough'), ('lr', LogisticRegression())], 'cache__3')
    grid = GridSearchCV(pipe, param_grid, error_score='raise')
    grid.fit(X, y)

    cache = MLCache.get_cache('cache__3')
    # 0.22 increases the number of cached results
    self.assertIn(len(cache), (7, 11))
    key = list(cache.keys())[0]
    self.assertIn("[('X',", key)
    self.assertIn("('copy', 'True')", key)
    MLCache.remove_cache('cache__3')
    self.assertEqual(grid0.best_params_, grid.best_params_)
def get_lsa():
    print("lsa")
    train = get_tfidf("lr")
    lsa = SVD(n_components=400)
    train = lsa.fit_transform(train)
    return train
def run_SVD(X, y, title):
    dims = list(np.arange(1, X.shape[1]))
    svd = SVD(random_state=10)
    # Only the final fit (n_components = X.shape[1] - 1) is used below; its
    # explained-variance ratios cover every component at once.
    for dim in dims:
        svd.set_params(n_components=dim)
        svd.fit_transform(X)
    ev = svd.explained_variance_ratio_
    cum_var = np.cumsum(ev)

    fig, ax1 = plt.subplots()
    ax1.plot(dims, cum_var, 'b-')
    ax1.set_xlabel('Principal Components')
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Cumulative Explained Variance Ratio', color='b')
    ax1.tick_params('y', colors='b')
    plt.grid(False)

    ax2 = ax1.twinx()
    ax2.plot(dims, ev, 'm-')
    ax2.set_ylabel('Explained Variance Ratio', color='m')
    ax2.tick_params('y', colors='m')
    plt.grid(False)

    plt.title("SVD: " + title)
    fig.tight_layout()
    plt.show()
def main():
    if len(sys.argv) < 2:
        print('Expected arguments are not provided.')
        return

    movieid = int(sys.argv[1])
    mlmovies = util.read_mlmovies()
    movie_actors = util.read_movie_actor()
    imdb_actor_info = util.read_imdb_actor_info()

    input_movie = mlmovies[mlmovies['movieid'] == movieid]['moviename'].values[0]
    actors_of_movie = movie_actors.where(
        movie_actors['movieid'] == movieid).dropna().loc[:, 'actorid'].unique()
    #print (actors_of_movie)

    movie_matrix = util.get_movie_tf_idf_matrix()
    actor_matrix = util.get_actor_tf_idf_matrix()
    #print(actor_matrix.shape)

    input_movie_vector = pd.DataFrame(movie_matrix.loc[movieid])  #.transpose()
    #print(input_movie_vector.shape)
    #similarity_matrix = actor_matrix.dot(input_movie_vector)
    #similarity_matrix = similarity_matrix[~similarity_matrix.index.isin(actors_of_movie)]
    #print(similarity_matrix)

    svd = SVD(n_components=no_of_components)
    svd.fit(movie_matrix)
    svd_movie_df = pd.DataFrame(svd.transform(movie_matrix), index=movie_matrix.index)
    input_movie_vector = pd.DataFrame(svd_movie_df.loc[movieid])
    #print(input_movie_vector)

    df_components = pd.DataFrame(svd.components_, columns=actor_matrix.columns).transpose()
    #print(df_components.shape)
    #print(df_components)
    #print(actor_matrix.shape)

    projected_matrix = actor_matrix.dot(df_components)
    #print(projected_matrix)
    similarity_matrix = projected_matrix.dot(input_movie_vector)
    similarity_matrix = similarity_matrix[~similarity_matrix.index.isin(actors_of_movie)]
    #print(similarity_matrix)

    actors = []
    for index, row in similarity_matrix.iterrows():
        actor_name = imdb_actor_info[imdb_actor_info['id'] == index]['name'].values[0]
        actors.append((index, actor_name, similarity_matrix.loc[index][movieid]))
    actors.sort(key=lambda tup: tup[2], reverse=True)
    #print (actors)

    util.print_output(movieid, input_movie, actors[:no_of_actors])
    util.write_output_file(movieid, input_movie, actors[:no_of_actors], output_file)
def svd(table, k):
    """
    Decompose table into matrices U . S . V = table.

    :param DataFrame table: table to decompose.
    :param int k: number of components to get from decomposition.
    :return: tuple of the reduced matrix and the component (V^T) matrix.
    """
    matrix = SVD(n_components=k)
    out = matrix.fit_transform(table)
    return out, matrix.components_
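# A hypothetical call sketch for svd() above; the tiny DataFrame is illustrative
# and the sketch assumes the module already imports pandas and TruncatedSVD as SVD.
import pandas as pd

table = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [2.0, 0.0, 1.0], 'c': [0.0, 1.0, 4.0]})
reduced, components = svd(table, 2)
print(reduced.shape)      # (3, 2): one row per original row, k latent columns
print(components.shape)   # (2, 3): one latent dimension per row of V^T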
def decompose_into_topics(document_concept_matrix, k_estimator=mdu.estimate_k_singular,
                          l1_ratio=1, alpha=1, sparsity_scale=0.15, decompo='NMF',
                          freq_threshold=0.85, normalize=True, mink=4):
    # We ignore concepts that are present too many times in this matrix,
    # or not present at all. Presence is binary.
    binz = np.zeros_like(document_concept_matrix)
    binz[document_concept_matrix > 0] = 1
    freqs = binz.sum(axis=0)
    num_docs = document_concept_matrix.shape[0]
    bad_cpts = np.nonzero(freqs > freq_threshold * num_docs)[0].tolist()
    bad_cpts += np.nonzero(freqs <= 0)[0].tolist()
    good_cpts = range(document_concept_matrix.shape[1])
    good_cpts = [c for c in good_cpts if (c not in bad_cpts)]
    m_clean = document_concept_matrix[:, good_cpts]

    if isinstance(k_estimator, int):
        k = k_estimator
    else:
        k, gaps, deltas = k_estimator(m_clean, exp=10)
        k = max([k, np.ceil(num_docs / 100), mink])
    print("\tk=" + str(k), end=" ")

    if decompo == 'NMF':
        model = NMF(n_components=int(k), init='random', random_state=0,
                    l1_ratio=l1_ratio, alpha=alpha)
    else:
        model = SVD(n_components=int(k))
    # print("\tC:", m_clean.shape)
    tdm = model.fit_transform(m_clean)

    topic_concept_matrix, threshold = mdu.remove_excess_nonzero(
        m_clean, model, good_cpts,
        numorigcpts=document_concept_matrix.shape[1], scale=sparsity_scale)
    tcm, tdm = mdu.remove_blank_topics(topic_concept_matrix, tdm)

    if normalize:
        rowsums = tcm.sum(axis=1)
        for top in range(tcm.shape[0]):
            tcm[top, :] = tcm[top, :] / rowsums[top]

    return tdm, tcm
def main():
    err, input_movie_ids = util.parse_input(sys.argv)
    if err:
        return

    # *** Write your code here ***
    # Process the movie list to get the matrix.
    matrix = util.get_movie_matrix_from_hd5()
    #print(matrix)

    # Perform SVD.
    svd = SVD(n_components=no_of_components)
    svd.fit(matrix)
    svd_df = pd.DataFrame(svd.transform(matrix), index=matrix.index)
    input_movie_df = svd_df.loc[input_movie_ids]

    output_movies = []
    for index, movie in svd_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies.append((index, cosine_sum))

    other_movies = list(filter(lambda tup: tup[0] not in input_movie_ids, output_movies))
    other_movies.sort(key=lambda tup: tup[1], reverse=True)
    output_movie_ids = [t[0] for t in other_movies][:5]

    # Print the output and log it.
    feedback = util.process_output(input_movie_ids, output_movie_ids, output_file)

    # Process feedback to get relevant movies and movies to be excluded.
    relevant_movies, movie_to_exclude = util.process_feedback(feedback, input_movie_ids)
    relevant_movie_count = len(relevant_movies)

    # If all recommended movies are relevant, there is nothing to revise.
    if relevant_movie_count == 5:
        print("\nAll the movies were relevant hence no modification to the suggestion")
        return

    # Fetch data frames for relevant and feedback movies.
    relevant_movies_df = svd_df.loc[relevant_movies]
    feedback_movies_df = svd_df.loc[list(feedback.keys())]
    modified_query = util.probabilistic_feedback_query(feedback_movies_df, relevant_movies_df,
                                                       svd_df.index, relevant_movie_count)
    revised_movie_ids = util.get_revised_movies(svd_df, modified_query, movie_to_exclude)
    util.print_revised(revised_movie_ids, output_file)
def main():
    store = pd.HDFStore('../task1/movie_final.h5')
    movie_matrix = store['df']
    print(movie_matrix)

    svd = SVD(n_components=500)
    svd.fit(movie_matrix)
    movie_matrix_svd = pd.DataFrame(svd.transform(movie_matrix), index=movie_matrix.index)

    print(movie_matrix.shape)
    print(movie_matrix_svd.shape)
    movie_matrix_svd.to_pickle("movie_matrix_svd.pkl")
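# A hypothetical follow-up sketch (not part of the original script): reload the
# reduced matrix pickled above to confirm the round trip.
import pandas as pd

movie_matrix_svd = pd.read_pickle("movie_matrix_svd.pkl")
print(movie_matrix_svd.shape)   # (n_movies, 500) after the 500-component SVD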
def main():
    scores = load('scores', {})
    PREPROCESSING = {
        'raw': lambda: get_featureset(dataset, tf_idf=False),
        'tfidf': lambda: get_featureset(dataset),
        'nostem': lambda: get_featureset(dataset, stem=False),
        'svd50': lambda: SVD(50).fit_transform(get_featureset(dataset)),
        'svd100': lambda: SVD(100).fit_transform(get_featureset(dataset)),
        'lda': lambda: get_lda(dataset, 100),
    }
    for dataset in DATASETS:
        for method in PREPROCESSING:
            # name of feature set
            name = method + ':' + dataset
            # check if we calculated this score before
            if name in scores:
                continue
            # we can only do it on some datasets
            if method == 'lda':
                if dataset not in LDA_DATASETS:
                    continue
            print(name)
            # get the data
            data = PREPROCESSING[method]()
            # train model
            scores[name] = get_scores(data)
            # save it
            save('scores', scores)
def do_svd(matrix, input_movie_ids):
    svd = SVD(n_components=no_of_components)
    svd.fit(matrix)
    svd_df = pd.DataFrame(svd.transform(matrix), index=matrix.index)
    input_movie_df = svd_df.loc[input_movie_ids]

    output_movies = {}
    for index, movie in svd_df.iterrows():
        cosine_sum = 0
        order = 1
        for j, input_movie in input_movie_df.iterrows():
            cosine_sum += (1 - cosine(movie, input_movie)) * order
            order -= order_factor
        output_movies[index] = cosine_sum
    return output_movies, svd_df
def pca(self, data, labels):
    # Calculate the mean
    mean = data.mean(axis=0)
    # Subtract the mean from the data
    data = np.subtract(data, mean)
    # Calculate the covariance matrix
    cov = np.cov(data.T)
    # Calculate eigenvalues and eigenvectors
    evals, evecs = np.linalg.eig(cov)
    # Get sorted (eigenvalue, eigenvector) tuples
    tuples = self.sort(evals, evecs)
    # Choosing top 2 eigenvectors
    tuples = tuples[0:2]
    # Build the projection matrix
    projection_matrix = np.zeros((2, self.cols))
    count = 0
    for i in tuples:
        for k, v in i:
            for j in range(0, self.cols):
                projection_matrix[count][j] = v[j]
            count += 1
    projection_matrix = projection_matrix.T
    # Project the data
    pca_data = data.dot(projection_matrix)
    self.plot(pca_data, labels, self.labels, self.name, "PCA")

    # Using the TruncatedSVD package
    svd = SVD(n_components=2)
    svd_data = svd.fit_transform(data)
    self.plot(svd_data, labels, self.labels, self.name, "SVD")

    tsne = TSNE(n_components=2, init="pca", learning_rate=100)
    tsne_data = tsne.fit_transform(data)
    self.plot(tsne_data, labels, self.labels, self.name, "tSNE")
def svd1(table, k):
    """
    Decompose table into matrices U . S . V = table.

    :param DataFrame table: table to decompose.
    :param int k: number of components to get from decomposition.
    :return DataFrame reduced matrix.
    """
    indexes = table.index
    matrix = SVD(n_components=k)
    out = matrix.fit_transform(table)
    temp = np.array(matrix.components_, dtype=float)
    l = 0
    for i in temp:
        j = 0
        print("LATENT SEMANTICS " + str(l))
        for comp in i:
            print(str(j) + "\t" + str(comp))
            j = j + 1
        l = l + 1
    return DataFrame(data=out, index=indexes, columns=range(k))
if not os.path.exists(args.Output_Folder):
    os.makedirs(args.Output_Folder)
hasil = open('{}/{}'.format(args.Output_Folder, args.Output_File), 'w')
n_topics = eval(args.n_topics)

with open(args.Clean_Tweets_Location, 'rb') as handle:
    tfidf, tfidf_terms = pickle.load(handle)

komponen = 5
if dimred == "grp":
    dr = GRP(n_components=komponen, random_state=11)
elif dimred == 'srp':
    if UseKernel:
        dr = SRP(n_components=komponen, random_state=11)
    else:
        dr = SRP(n_components=komponen, random_state=11, dense_output=True)
elif dimred == 'svd':
    dr = SVD(n_components=komponen, random_state=11)

if dimred in ['lda', 'nmf']:
    if dimred == 'lda':
        dr = LDA(n_components=n_topics, random_state=11).fit(tfidf)
    else:
        dr = NMF(n_components=n_topics, random_state=11).fit(tfidf)
    cntr = dr.components_
else:
    tfile = open(
        "Time/{}".format(dimred.upper()) + UseKernel * "-Kernel" +
        "/{}".format(time_file), 'a+')
    timer = time()
    data = dr.fit_transform(tfidf)
    tfile.write("{}\n".format(time() - timer))
    tfile.close()
    cntr, u, d = huft.fcmeans(data.T,
def svd(train, test, n_components):
    svd = SVD(n_components=n_components)
    new_train = svd.fit_transform(X=train)
    new_test = svd.transform(X=test)
    return new_train, new_test
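# A hypothetical usage sketch for svd() above; the random matrices stand in for
# real train/test feature sets and assume TruncatedSVD is imported as SVD.
import numpy as np

rng = np.random.RandomState(0)
train, test = rng.rand(100, 30), rng.rand(20, 30)
new_train, new_test = svd(train, test, n_components=5)
print(new_train.shape, new_test.shape)   # (100, 5) (20, 5)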
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD as SVD
import sklearn.model_selection as k

r = pd.read_csv('bank_contacts.csv')
x = r.drop('credit_application', axis=1)
y = r['credit_application']

train_x, test_x, train_y, test_y = k.train_test_split(x, y, test_size=0.2, random_state=42)

sc = StandardScaler()
train_x = sc.fit_transform(train_x)
test_x = sc.transform(test_x)

svd = SVD(n_components=3, random_state=42)
train_x = svd.fit_transform(train_x, train_y)
test_x = svd.transform(test_x)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(train_x, train_y)
pred = classifier.predict(test_x)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
print(confusion_matrix(test_y, pred))
print('Accuracy:', accuracy_score(test_y, pred))

plt.scatter(pred, test_x[:, 0], marker='o')
plt.scatter(pred, test_x[:, 1], marker='o')
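# Optional diagnostic sketch (not part of the original script): inspect how much
# variance the 3 retained components explain, using the svd object fitted above.
import numpy as np

print(svd.explained_variance_ratio_)             # per-component share of variance
print(np.cumsum(svd.explained_variance_ratio_))  # cumulative share retained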
def task6(self, *args):
    """
    Command:\ttask6 <k>
    Description:\tCreates a location - location similarity matrix, performs SVD, \
and reports the top k latent semantics.
    Arguments:
    \tK - Number of latent semantics to identify.
    """
    if len(args) < 1:
        print("[ERROR] Not enough args were provided. Expected 1 but got " + str(len(args)))
        print("\targs = " + str(args))
        return
    if len(args) > 1:
        print("[ERROR] Too many arguments were provided. Expected 1 but got " + str(len(args)))
        print("\targs = " + str(args))
        return
    if not self.__database__:
        print("[ERROR] The Database must be loaded before this can be run.")
        print("\tCommand: load <filepath>")
        return
    try:
        k = int(args[0])
    except:
        print("[ERROR] One or more arguments could not be parsed: " + str(args))
        return

    # First get all the location tables holding all the visual descriptors.
    # IMP NOTE: Remember to change all the static values after the test data arrives.
    all_location_tables = dict()
    for id in range(1, 36):
        all_location_tables[id] = Database.get_vis_table(self.__database__, locationid=id)

    # Find the similarity between each pair of locations and store the results in a dictionary.
    if isfile('./all_similarities.npy'):
        all_similarities = np.load('all_similarities.npy').item()
    else:
        all_similarities = dict()
        for i in range(1, 36):
            for j in range(i, 36):
                cos_sim = cosine_similarity(all_location_tables[i], all_location_tables[j])
                all_similarities[(i, j)] = Scoring.score_matrix(cos_sim)
                all_similarities[(j, i)] = all_similarities[(i, j)]
        np.save('all_similarities.npy', all_similarities)

    similarity_matrix = df(index=range(1, 36), columns=range(1, 36))
    for i in range(1, 36):
        sim_list = list()
        for j in range(1, 36):
            sim_list.append(all_similarities[(i, j)])
        similarity_matrix.loc[i] = sim_list
    similarity_matrix.to_csv('Task6_SimilarityMatrix.csv')
    print("Location-Location Similarity matrix created...")

    reduced = SVD(n_components=k)
    # reduced_table = reduced.fit_transform(similarity_matrix)
    reduced.fit(similarity_matrix)
    VTranspose = df(data=reduced.components_, index=range(1, k + 1), columns=range(1, 36))
    VTranspose.to_csv('task6transposetable.csv')

    filename = 'devset_topics.xml'
    tree = ET.parse(filename)
    root = tree.getroot()
    location_name = dict()
    location_dict = dict()
    i = 0
    while i < len(root):
        v1 = int(root[i][0].text)
        v2 = root[i][1].text
        location_name[v1] = v2
        i += 1

    print("Top k latent semantics in form of their location-weight pair are:")
    for index, row in VTranspose.iterrows():
        for j in range(1, 36):
            loc = location_name[j]
            location_dict[loc] = row[j]
        sorted_location_dict = sorted(location_dict.items(), key=itemgetter(1), reverse=True)
        # print("latent semantic " + str(index) + " : " + str(sorted_location_dict))
        print(f"LATENT SEMANTIC {index}")
        for key, value in sorted_location_dict:
            print(f"{key} : {value}")
def svd2(table, k):
    indexes = table.index
    matrix = SVD(n_components=k)
    out = matrix.fit_transform(table)
    return DataFrame(data=out, index=indexes, columns=range(k))
    dataY = data.iloc[:, 41]
    features = list(dataX.columns.values)
    dataset = "QSAR"
    k = 10
    k_em = 10
    comp = 39
else:
    dataX = data.iloc[:, :20]
    dataY = data.iloc[:, 20]
    features = list(dataX.columns.values)
    dataset = "Voice"
    k = 10
    k_em = 15
    comp = 19

print("Running SVD for {}...".format(datst))
svd = SVD(n_components=comp, random_state=5)
dataX_SVD = svd.fit_transform(dataX)

# kmeans
model = KMeans(n_clusters=k)
labels = model.fit_predict(dataX_SVD)
model = KMeans(n_clusters=k)
labels_KM = model.fit_predict(dataX)

accuracy = cluster_acc(dataY, labels)
print("\nAccuracy for k-means on", dataset, "is", accuracy)
accuracy_clusters = cluster_acc(labels_KM, labels)
print("\nCluster alignment for k-means is", accuracy_clusters)

# EM
cv = sklearn.feature_extraction.text.CountVectorizer()
m = cv.fit_transform(['this is a document',
                      'this is a second document',
                      'third document document document']).todense()
print(cv.vocabulary_)
print(m)

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
td_count_matrix = count_vectorizer.fit_transform(df['tags'])
td_count_matrix.shape
list(count_vectorizer.vocabulary_.items())[:10]

from sklearn.decomposition import TruncatedSVD as SVD

svd = SVD(10)
svd.fit(td_count_matrix)
for i in range(len(svd.components_)):
    topk = sorted(zip(svd.components_[i], count_vectorizer.get_feature_names()),
                  reverse=True)[:10]
    print(' + '.join('{:.3f} * {}'.format(v, word) for v, word in topk))

# vectorize the documents
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
td_count_matrix = count_vectorizer.fit_transform(df['description'])
svd = SVD(10)
svd.fit(td_count_matrix)

# show just the words, not the scores this time
for i in range(len(svd.components_)):
    topk = sorted(zip(svd.components_[i], count_vectorizer.get_feature_names()),
                  reverse=True)[:10]
    print(' | '.join(w for _, w in topk))
(trainData, testData, trainTarget, testTarget) = split
model = LinearSVC()
model.fit(trainData, trainTarget)
baseline = metrics.accuracy_score(model.predict(testData), testTarget)

print("Running RP...")
accuracies = []
for comp in comps:
    # create the dimensionality reduction for this number of components
    # (the random-projection and PCA variants are kept here commented out)
    #sp = SparseRandomProjection(n_components=comp)
    #X = sp.fit_transform(trainData)
    #sp = PCA(n_components=comp, random_state=5)
    #X = sp.fit_transform(trainData)
    sp = SVD(n_components=comp)
    X = sp.fit_transform(trainData)

    # train a classifier on the reduced data
    model = LinearSVC()
    model.fit(X, trainTarget)

    # evaluate the model and update the list of accuracies
    test = sp.transform(testData)
    accuracies.append(metrics.accuracy_score(model.predict(test), testTarget))

plt.figure()
plt.suptitle("Accuracy of Sparse Projection on {}".format(dataset))
plt.xlabel("# of Components")
plt.ylabel("Accuracy")
if dataset == "QSAR":
    plt.xlim([2, 40])
try:
    with open(fname) as pearl:
        text = pearl.read()
        token_dict[f] = re.sub("[^A-Za-z]", " ", text)
except UnicodeDecodeError as e:
    with open(fname, encoding="utf8") as pearl:
        text = pearl.read()
        token_dict[f] = re.sub("[^A-Za-z]", " ", text)

stopwords = stopwords.words("english")
add = ['search', 'engine', 'web', 'internet']
stopwords.extend(add)

tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words=stopwords)
tfs = tfidf.fit_transform(token_dict.values())

lsa = SVD(n_components=4, n_iter=100)
doc_top = lsa.fit_transform(tfs)
doc_top = Normalizer(copy=False).fit_transform(doc_top)

terms = tfidf.get_feature_names()
for i, comp in enumerate(lsa.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:5]
    print("Topic %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")

##import umap
##X_topics = lsa.fit_transform(tfs)
##embedding = umap.UMAP(n_neighbors=150, min_dist=0.5, random_state=12).fit_transform(X_topics)
##
def run_EM_dr(X_norm, y, title):
    range_n_clusters = [2, 3, 4, 5, 6]
    loss_n, loss_pca, loss_ica, loss_srp, loss_svd = [], [], [], [], []
    silhouette_avg0, silhouette_avg1, silhouette_avg2, silhouette_avg3, silhouette_avg4 = [], [], [], [], []
    homo0, homo1, homo2, homo3, homo4 = [], [], [], [], []
    comp0, comp1, comp2, comp3, comp4 = [], [], [], [], []

    for index, n_clusters in enumerate(range_n_clusters):
        X_pca = PCA(n_components=3, random_state=5).fit_transform(X_norm)
        X_ica = FastICA(n_components=2, random_state=5).fit_transform(X_norm)
        X_srp = SRP(n_components=4, random_state=5).fit_transform(X_norm)
        X_svd = SVD(n_components=4, random_state=5).fit_transform(X_norm)

        clusterer_n = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_norm)
        clusterer_pca = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_pca)
        clusterer_ica = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_ica)
        clusterer_srp = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_srp)
        clusterer_svd = GaussianMixture(n_components=n_clusters, random_state=10).fit(X_svd)

        loss_n.append(clusterer_n.bic(X_norm))
        loss_pca.append(clusterer_pca.bic(X_pca))
        loss_ica.append(clusterer_ica.bic(X_ica))
        loss_srp.append(clusterer_srp.bic(X_srp))
        loss_svd.append(clusterer_svd.bic(X_svd))

        cluster0_labels = clusterer_n.predict(X_norm)
        cluster1_labels = clusterer_pca.predict(X_pca)
        cluster2_labels = clusterer_ica.predict(X_ica)
        cluster3_labels = clusterer_srp.predict(X_srp)
        cluster4_labels = clusterer_svd.predict(X_svd)

        silhouette_avg0.append(silhouette_score(X_norm, cluster0_labels))
        silhouette_avg1.append(silhouette_score(X_pca, cluster1_labels))
        silhouette_avg2.append(silhouette_score(X_ica, cluster2_labels))
        silhouette_avg3.append(silhouette_score(X_srp, cluster3_labels))
        silhouette_avg4.append(silhouette_score(X_svd, cluster4_labels))

        homo0.append(metrics.homogeneity_score(y, cluster0_labels))
        homo1.append(metrics.homogeneity_score(y, cluster1_labels))
        homo2.append(metrics.homogeneity_score(y, cluster2_labels))
        homo3.append(metrics.homogeneity_score(y, cluster3_labels))
        homo4.append(metrics.homogeneity_score(y, cluster4_labels))

        comp0.append(metrics.completeness_score(y, cluster0_labels))
        comp1.append(metrics.completeness_score(y, cluster1_labels))
        comp2.append(metrics.completeness_score(y, cluster2_labels))
        comp3.append(metrics.completeness_score(y, cluster3_labels))
        comp4.append(metrics.completeness_score(y, cluster4_labels))

    # BIC
    plt.plot(range_n_clusters, loss_n, label="without rd")
    plt.plot(range_n_clusters, loss_pca, label="pca")
    plt.plot(range_n_clusters, loss_ica, label="ica")
    plt.plot(range_n_clusters, loss_srp, label="srp")
    plt.plot(range_n_clusters, loss_svd, label="svd")
    plt.xlabel('number of clusters')
    plt.title('EM:BIC')
    plt.legend(loc="best")
    plt.show()

    # silhouette
    plt.plot(range_n_clusters, silhouette_avg0, label="without rd")
    plt.plot(range_n_clusters, silhouette_avg1, label="pca")
    plt.plot(range_n_clusters, silhouette_avg2, label="ica")
    plt.plot(range_n_clusters, silhouette_avg3, label="srp")
    plt.plot(range_n_clusters, silhouette_avg4, label="svd")
    plt.xlabel('number of clusters')
    plt.title('EM:silhouette')
    plt.legend(loc="best")
    plt.show()

    # homogeneity
    plt.plot(range_n_clusters, homo0, label="without rd")
    plt.plot(range_n_clusters, homo1, label="pca")
    plt.plot(range_n_clusters, homo2, label="ica")
    plt.plot(range_n_clusters, homo3, label="srp")
    plt.plot(range_n_clusters, homo4, label="svd")
    plt.xlabel('number of clusters')
    plt.title('EM:homogeneity')
    plt.legend(loc="best")
    plt.show()
    # completeness
    plt.plot(range_n_clusters, comp0, label="without rd")
    plt.plot(range_n_clusters, comp1, label="pca")
    plt.plot(range_n_clusters, comp2, label="ica")
    plt.plot(range_n_clusters, comp3, label="srp")
    plt.plot(range_n_clusters, comp4, label="svd")
    plt.xlabel('number of clusters')
    plt.title('EM:completeness')
    plt.legend(loc="best")
    plt.show()
gs.fit(labels_EM_RP.reshape(-1, 1), dataY)
tmp = pd.DataFrame(gs.cv_results_)
tmp.to_csv(out + 'QSAR NN EM RP.csv')
best_indices = tmp.index[tmp['rank_test_score'] == 1].tolist()
best_em = best_em.append(
    {
        'Layers': str(tmp.iloc[best_indices[0], 4]),
        'Iterations': tmp.iloc[best_indices[0], 5],
        'Score': tmp.iloc[best_indices[0], 12]
    },
    ignore_index=True)

# Fit/transform with SVD
print("Running SVD...")
svd = SVD(n_components=3, random_state=5)
dataX_SVD = svd.fit_transform(dataX)

# Run KM
print("Running k-means...")
model = KMeans(n_clusters=km)
labels_KM_SVD = model.fit_predict(dataX_SVD)

grid = {
    'NN__hidden_layer_sizes': nn_arch,
    'NN__max_iter': nn_iter,
    'NN__learning_rate_init': [0.016],
    'NN__alpha': [0.316227766]
}
mlp = MLPClassifier(activation='relu', early_stopping=True,
print("{0}: {1}, with distance of {2}:".format(i, us_canada_user_rating_pivot.index[indices.flatten()[i]], distances.flatten()[i])) ### collaborative filtering using matrix factorization ### us_canada_user_rating_pivot2 = us_canada_user_rating.pivot(index = 'userID', columns = 'bookTitle', values = 'bookRating').fillna(0) #print (us_canada_user_rating_pivot2.head()) #print (us_canada_user_rating_pivot2.shape) X = us_canada_user_rating_pivot2.values.T import sklearn from sklearn.decomposition import TruncatedSVD as SVD SVD = SVD(n_components =12, random_state = 17) matrix = SVD.fit_transform(X) #print (matrix.shape) import warnings warnings.filterwarnings("ignore", category = RuntimeWarning) corr = np.corrcoef(matrix) #print (corr.shape) us_canada_book_title = us_canada_user_rating_pivot2.columns us_canada_book_list = list(us_canada_book_title) coffey_hands = us_canada_book_list.index("The Green Mile: Coffey's Hands (Green Mile Series)") #print (coffey_hands)