def get_eigenstuff(self):
    self._step('Finding eigenvectors...')
    document_matrix = self.get_documents_matrix()
    theblend, study_concepts = self.get_blend()
    U, Sigma, V = theblend.normalize_all().svd(k=self.config('axes'))
    indices = [U.row_index(concept) for concept in study_concepts]
    reduced_U = U[indices]
    if self.is_associative():
        doc_rows = divisi2.aligned_matrix_multiply(document_matrix, reduced_U)
        projections = reduced_U.extend(doc_rows)
    else:
        doc_indices = [V.row_index(doc.name) for doc in self.documents
                       if doc.name in V.row_labels]
        projections = reduced_U.extend(V[doc_indices])
    #if SUBTRACT_MEAN:
    #    sdoc_indices = [projections.row_index(doc.name) for doc in
    #                    self.study_documents if doc.name in projections.row_labels]
    #    projections -= np.asarray(projections[sdoc_indices]).mean(axis=0)
    if SUBTRACT_MEAN:
        projections -= np.asarray(projections).mean(axis=0)
    return document_matrix, projections, Sigma
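# Hedged usage sketch (not from the original study code): the projections and
# Sigma returned by get_eigenstuff can be recombined into a spectral
# association matrix the same way expand_study does further down. `study` is
# an assumed object exposing get_eigenstuff(); reconstruct_activation is the
# divisi2 call already used elsewhere in this codebase.
import divisi2

def eigenstuff_to_spectral(study):
    document_matrix, projections, Sigma = study.get_eigenstuff()
    # Rebuild a concept/document association matrix from the low-rank factors.
    return divisi2.reconstruct_activation(projections, Sigma, post_normalize=True)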
def lab_color_for_text(self, concept):
    if concept in self.color_matrix.row_labels:
        return self.color_matrix.row_named(concept)
    starting_set = {}
    for subconcept in en.nl.extract_concepts(concept):
        if subconcept in self.colorfulness.labels:
            starting_set[subconcept] = self.colorfulness.entry_named(subconcept)
    if not starting_set:
        return divisi2.DenseVector([0, 0, 0, 0],
                                   OrderedSet(["L", "a", "b", "colorful"]))
    category = divisi2.SparseVector.from_dict(starting_set)
    vector = self.spreading_activation.left_category(category)
    aligned_vector = vector[self.concept_label_map]
    for subconcept in en.nl.extract_concepts(concept):
        if subconcept in aligned_vector.labels:
            index = aligned_vector.index(subconcept)
            aligned_vector[index] += self.colorfulness.entry_named(subconcept)
    print aligned_vector.top_items()
    #aligned_vector /= numpy.sum(aligned_vector)
    #color = divisi2.dot(aligned_vector, self.smaller_color_matrix)
    sparse_vector = divisi2.SparseVector.from_named_entries(
        [(value, key) for (key, value) in aligned_vector.top_items(10)])
    sparse_vector /= (sparse_vector.vec_op(numpy.sum) + 0.000001)
    color = divisi2.aligned_matrix_multiply(sparse_vector, self.smaller_color_matrix)
    return divisi2.DenseVector(color, OrderedSet(["L", "a", "b", "colorful"]))
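# Hedged usage sketch (assumed caller, not part of the original class): read
# the L*a*b* and colorfulness components out of the vector returned by
# lab_color_for_text. `colorizer` stands in for an instance of the class above,
# and the concept string is made up for illustration.
lab = colorizer.lab_color_for_text(u'sunset over the ocean')
for component in ("L", "a", "b", "colorful"):
    print component, lab.entry_named(component)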
def createSpectralMatrix():
    # proj is a ReconstructedMatrix of the form terms:terms.
    # (raw string so the Windows path backslashes are not treated as escapes)
    proj = divisi2.load(r'C:\Users\LLPadmin\Desktop\luminoso\ThaiFoodStudy\Results\spectral.rmat')
    # create sparse matrix for clusters of the form terms:clusters (row, col).
    clusterMatrix, cluster_names, term_names, termsDict = randomClustersMatrix(proj.col_labels, 10)
    count = 0
    while True:
        count += 1
        clusterMatrix = divisi2.aligned_matrix_multiply(
            proj.left, divisi2.aligned_matrix_multiply(proj.right, clusterMatrix))
        repeat = normalize(clusterMatrix, termsDict, cluster_names, term_names)
        if repeat:
            print count
            break
    return clusterMatrix
def expand_study(study_name):
    study = StudyDirectory(study_name).get_study()
    theblend, concepts = study.get_assoc_blend()
    U, S, V = theblend.normalize_all().svd(k=50)
    doc_rows = divisi2.aligned_matrix_multiply(study.get_documents_matrix(), U)
    projections = U.extend(doc_rows)
    spectral = divisi2.reconstruct_activation(projections, S, post_normalize=True)
    divisi2.save(spectral, study_name + '/Results/expanded.rmat')
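# Hedged follow-up sketch: load the matrix that expand_study saved and query
# it with the same accessors used elsewhere in this code. The study name and
# query concept are assumptions for illustration; divisi2 is assumed imported.
study_name = 'ThaiFoodStudy'
spectral = divisi2.load(study_name + '/Results/expanded.rmat')
print spectral.row_named(u'food').top_items(10)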
def createSpectralMatrix(k):
    # proj is a ReconstructedMatrix of the form terms:terms.
    proj = divisi2.load(os.path.abspath('../../ThaiFoodStudy') + '/Results/spectral.rmat')
    #proj = examples.spreading_activation()
    # create sparse matrix for clusters of the form terms:clusters (row, col).
    clusterMatrix, cluster_names, term_names, termsDict = randomClustersMatrix(proj.row_labels, k)
    count = 0
    while True:
        count += 1
        print count
        clusterMatrix = divisi2.aligned_matrix_multiply(
            proj.left, divisi2.aligned_matrix_multiply(proj.right, clusterMatrix))
        repeat = normalize(clusterMatrix, termsDict, cluster_names, term_names)
        if repeat:
            print "After " + str(count) + " iterations, we got acceptable clusters."
            break
    return clusterMatrix
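# Self-contained numpy sketch of the iteration idea above, with made-up data:
# repeatedly multiplying a random terms-x-clusters matrix by the terms-x-terms
# association (the stand-in for proj.left * proj.right) spreads each cluster's
# weight to related terms. randomClustersMatrix and normalize are project
# helpers and are not reproduced here; column renormalization stands in for them.
import numpy as np

np.random.seed(0)
assoc = np.random.rand(6, 6)
assoc = (assoc + assoc.T) / 2.0        # symmetric terms-x-terms association
clusters = np.random.rand(6, 2)        # random terms-x-clusters starting point
for _ in range(20):
    clusters = assoc.dot(clusters)
    clusters /= np.abs(clusters).sum(axis=0)   # keep each cluster column bounded
print clusters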
def compute_stats(self, docs, spectral):
    """
    Calculate statistics.

    Consistency: how tightly-clustered the documents are in the spectral
    decomposition space.

    Centrality: a Z-score for how "central" each concept and document is.
    Same general idea as "congruence" from Luminoso 1.0.
    """
    if len(self.study_documents) <= 1:
        # consistency and centrality are undefined
        consistency = None
        centrality = None
        correlation = None
        core = None
        key_concepts = None
        c_centrality = None
        c_correlation = None
    else:
        # Determine which indices of the association matrix correspond to
        # documents.
        doc_indices = [spectral.row_index(doc.name)
                       for doc in self.study_documents
                       if doc.name in spectral.row_labels]
        valid_concepts = [c for c in spectral.row_labels
                          if not c.endswith('.txt')]
        concept_indices = [spectral.row_index(c) for c in valid_concepts]

        # Make an ad hoc category of documents, then find how much each
        # document is associated with this average document.
        category_vec = divisi2.DenseVector(spectral.shape[0], spectral.row_labels)
        category_vec[doc_indices] = 1.0 / len(doc_indices)
        all_assoc = spectral.left_category(category_vec)
        doc_assoc = all_assoc[doc_indices]

        # Calculate similarity statistics over all documents.
        doc_mean = np.mean(np.asarray(doc_assoc))
        doc_stdev = np.std(np.asarray(doc_assoc))
        doc_stderr = doc_stdev / np.sqrt(len(doc_indices))

        # ...and over all concepts, though we may not need this.
        all_mean = np.mean(np.asarray(all_assoc))
        all_stdev = np.std(np.asarray(all_assoc))
        all_stderr = all_stdev / np.sqrt(spectral.shape[0])

        consistency = doc_mean / doc_stderr
        centrality = divisi2.DenseVector((all_assoc - doc_mean) / doc_stderr,
                                         spectral.row_labels)
        correlation = divisi2.DenseVector(all_assoc / doc_stderr,
                                          spectral.row_labels)
        core = centrality.top_items(len(centrality) / 2)
        core = [c[0] for c in core
                if c[0] in valid_concepts and c[1] > .001][:20]

        c_centrality = {}
        c_correlation = {}
        key_concepts = {}

        # the number of times each concept appears in each document
        doc_occur = self._documents_matrix
        # the average number of occurrences you expect of each document
        baseline = (1.0 + doc_occur.col_op(len)) / doc_occur.shape[0]

        for doc in self.canonical_documents:
            # record centrality and correlation for this document
            c_centrality[doc.name] = centrality.entry_named(doc.name)
            c_correlation[doc.name] = correlation.entry_named(doc.name)
            # find a weighted vector of similar documents
            docvec = np.maximum(0, spectral.row_named(doc.name)[doc_indices]) ** 3
            docvec /= (0.0001 + np.sum(docvec))
            keyvec = divisi2.aligned_matrix_multiply(docvec, doc_occur)
            assert not any(np.isnan(keyvec))
            assert not any(np.isinf(keyvec))
            interesting = spectral.row_named(doc.name)[concept_indices]
            #interesting = keyvec/baseline
            key_concepts[doc.name] = []
            for key, val in interesting.top_items(5):
                if val > 0.0 and keyvec.entry_named(key) > 0.0:
                    key_concepts[doc.name].append((key, keyvec.entry_named(key)))

    return {
        'num_documents': self.num_documents,
        'num_concepts': spectral.shape[0] - self.num_documents,
        'consistency': consistency,
        'centrality': c_centrality,
        'correlation': c_correlation,
        'key_concepts': key_concepts,
        'core': core,
        'timestamp': list(time.localtime())
    }
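# Minimal numeric sketch of the consistency statistic computed above: the mean
# document-to-category association divided by its standard error
# (stdev / sqrt(n)). The association scores below are made up for illustration.
import numpy as np

doc_assoc = np.array([0.42, 0.37, 0.51, 0.45])
doc_mean = np.mean(doc_assoc)
doc_stderr = np.std(doc_assoc) / np.sqrt(len(doc_assoc))
consistency = doc_mean / doc_stderr
print consistency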
work = [u'business', u'job']
religion = [u'faith', u'religion', u'church']
food = [u'food', u'coffee', u'wine', u'apple']
travel = [u'travel', u'traveling']

# curated, we start with these categories
sportCat = divisi2.category(u'sport', u'basketball', u'soccer', u'entertainment',
                            u'football', u'baseball', u'ski')
artCat = divisi2.category(u'guitar', u'acoustic guitar', u'music',
                          u'classical music', u'poetry', u'piano', u'jazz',
                          u'art', u'dance', u'design')
learningCat = divisi2.category(u'education', u'research', u'literature',
                               u'news', u'science')
moviesCat = divisi2.category(u'theater', u'cinema', u'television', u'movies',
                             u'theatre')
workCat = divisi2.category(u'business', u'job')
religionCat = divisi2.category(u'faith', u'religion', u'church')
foodCat = divisi2.category(u'food', u'coffee', u'wine', u'apple')
travelCat = divisi2.category(u'travel', u'traveling')

sport_features = divisi2.aligned_matrix_multiply(sport, matrix)
sport_features.to_dense().top_items()
sim.left_category(sport).top_items()
sim.left_category(sport).entry_named('run')

catList = [sport, art, learning, movies, work, religion, food, travel]
catMatrix = [sportCat, artCat, learningCat, moviesCat, workCat, religionCat,
             foodCat, travelCat]
catString = ['sport', 'art', 'learning', 'movies', 'work', 'religion', 'food',
             'travel']

# removing interests we've already categorized
needCat = []
usedCat = []
for cat in catList:
    for i in range(len(cat)):
        usedCat.append(cat[i])

for interest in found:
def compute_stats(self, docs, spectral):
    """
    Calculate statistics.

    Consistency: how tightly-clustered the documents are in the spectral
    decomposition space.

    Centrality: a Z-score for how "central" each concept and document is.
    Same general idea as "congruence" from Luminoso 1.0.
    """
    if len(self.study_documents) <= 1:
        # consistency and centrality are undefined
        consistency = None
        centrality = None
        correlation = None
        core = None
        # per-document statistics are undefined as well
        c_centrality = None
        c_correlation = None
        key_concepts = None
    else:
        concept_sums = docs.col_op(np.sum)
        doc_indices = [spectral.left.row_index(doc.name)
                       for doc in self.study_documents
                       if doc.name in spectral.left.row_labels]

        # Compute the association of all study documents with each other
        assoc_grid = np.asarray(spectral[doc_indices, doc_indices].to_dense())
        assert not np.any(np.isnan(assoc_grid))
        assoc_list = []
        for i in xrange(1, assoc_grid.shape[0]):
            assoc_list.extend(assoc_grid[i, :i])

        reference_mean = np.mean(assoc_list)
        reference_stdev = np.std(assoc_list)
        reference_stderr = reference_stdev / len(doc_indices)
        consistency = reference_mean / reference_stderr
        ztest_stderr = reference_stdev / np.sqrt(len(doc_indices))

        all_assoc = np.asarray(spectral[:, doc_indices].to_dense())
        all_means = np.mean(all_assoc, axis=1)
        all_stdev = np.std(all_assoc, axis=1)
        all_stderr = all_stdev / np.sqrt(len(doc_indices))

        centrality = divisi2.DenseVector((all_means - reference_mean) / ztest_stderr,
                                         spectral.row_labels)
        correlation = divisi2.DenseVector(all_means / ztest_stderr,
                                          spectral.row_labels)
        core = centrality.top_items(100)
        core = [c[0] for c in core
                if c[0] in concept_sums.labels
                and concept_sums.entry_named(c[0]) >= 2][:10]

        c_centrality = {}
        c_correlation = {}
        key_concepts = {}
        sdoc_indices = [spectral.col_index(sdoc.name)
                        for sdoc in self.study_documents
                        if sdoc.name in spectral.col_labels]
        doc_occur = np.abs(np.minimum(1, self._documents_matrix.to_dense()))
        baseline = (1.0 + np.sum(np.asarray(doc_occur), axis=0)) / doc_occur.shape[0]
        for doc in self.canonical_documents:
            c_centrality[doc.name] = centrality.entry_named(doc.name)
            c_correlation[doc.name] = correlation.entry_named(doc.name)
            docvec = np.maximum(0, spectral.row_named(doc.name)[sdoc_indices])
            docvec /= (0.00001 + np.sum(docvec))
            keyvec = divisi2.aligned_matrix_multiply(docvec, doc_occur)
            interesting = keyvec / baseline
            key_concepts[doc.name] = []
            for key, val in interesting.top_items(5):
                key_concepts[doc.name].append((key, keyvec.entry_named(key)))

    return {
        'num_documents': self.num_documents,
        'num_concepts': spectral.shape[0] - self.num_documents,
        'consistency': consistency,
        'centrality': c_centrality,
        'correlation': c_correlation,
        'key_concepts': key_concepts,
        'core': core,
        'timestamp': list(time.localtime())
    }
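# Hedged numpy sketch of the pairwise step above: collect the below-diagonal
# entries of the document-to-document association grid, i.e. each unordered
# document pair exactly once. The grid values are made up for illustration.
import numpy as np

assoc_grid = np.array([[1.0, 0.3, 0.5],
                       [0.3, 1.0, 0.4],
                       [0.5, 0.4, 1.0]])
assoc_list = []
for i in xrange(1, assoc_grid.shape[0]):
    assoc_list.extend(assoc_grid[i, :i])   # three pairwise associations
print assoc_list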
def categoryTopFeatures(self, category, n=20):
    category_features = divisi2.aligned_matrix_multiply(category, self.getSMatrix())
    return [(x[0][1], self.getById(x[0][2]), x[1])
            for x in category_features.to_dense().top_items(n)]
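# Hedged usage sketch: build a category with divisi2.category (as in the
# interest-classification snippet above) and list its top features. `tagger`
# stands in for an instance of the class defining categoryTopFeatures, and
# divisi2 is assumed imported.
foodCat = divisi2.category(u'food', u'coffee', u'wine', u'apple')
for feature, item, weight in tagger.categoryTopFeatures(foodCat, n=10):
    print feature, item, weight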