Example #1
    def get_eigenstuff(self):
        self._step('Finding eigenvectors...')
        document_matrix = self.get_documents_matrix()
        theblend, study_concepts = self.get_blend()
        U, Sigma, V = theblend.normalize_all().svd(k=self.config('axes'))
        indices = [U.row_index(concept) for concept in study_concepts]
        reduced_U = U[indices]
        if self.is_associative():
            doc_rows = divisi2.aligned_matrix_multiply(document_matrix, reduced_U)
            projections = reduced_U.extend(doc_rows)

        else:
            doc_indices = [V.row_index(doc.name)
                           for doc in self.documents
                           if doc.name in V.row_labels]
            projections = reduced_U.extend(V[doc_indices])
        
        #if SUBTRACT_MEAN:
        #    sdoc_indices = [projections.row_index(doc.name) for doc in
        #    self.study_documents if doc.name in projections.row_labels]
        #    projections -= np.asarray(projections[sdoc_indices]).mean(axis=0)
        if SUBTRACT_MEAN:
            projections -= np.asarray(projections).mean(axis=0)

        return document_matrix, projections, Sigma
Example #2
    def lab_color_for_text(self, concept):
        if concept in self.color_matrix.row_labels:
            return self.color_matrix.row_named(concept)
        starting_set = {}
        for subconcept in en.nl.extract_concepts(concept):
            if subconcept in self.colorfulness.labels:
                starting_set[subconcept] = self.colorfulness.entry_named(subconcept)
        if not starting_set:
            return divisi2.DenseVector([0,0,0,0], OrderedSet(["L", "a", "b", "colorful"]))
        category = divisi2.SparseVector.from_dict(starting_set)
        vector = self.spreading_activation.left_category(category)
        aligned_vector = vector[self.concept_label_map]
        for subconcept in en.nl.extract_concepts(concept):
            if subconcept in aligned_vector.labels:
                index = aligned_vector.index(subconcept)
                aligned_vector[index] += self.colorfulness.entry_named(subconcept)
        print aligned_vector.top_items()
        #aligned_vector /= numpy.sum(aligned_vector)
        #color = divisi2.dot(aligned_vector, self.smaller_color_matrix)
        
        sparse_vector = divisi2.SparseVector.from_named_entries(
            [(value, key) for (key, value) in aligned_vector.top_items(10)])
        sparse_vector /= (sparse_vector.vec_op(numpy.sum) + 0.000001)
        color = divisi2.aligned_matrix_multiply(sparse_vector, self.smaller_color_matrix)

        return divisi2.DenseVector(color, OrderedSet(["L", "a", "b", "colorful"]))
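A minimal usage sketch (not part of the original example): `colorizer` stands in for an instance of the unshown class that defines lab_color_for_text, and the query string is made up; only entry_named(), which appears elsewhere on this page, is used to read the labeled channels of the returned vector.

# Hypothetical usage of the method above; `colorizer` is a placeholder instance.
lab = colorizer.lab_color_for_text(u'ripe strawberry')
print lab.entry_named("L"), lab.entry_named("a"), lab.entry_named("b")
print lab.entry_named("colorful")   # how strongly the blend thinks this concept has a color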
Example #3
    def get_eigenstuff(self):
        self._step('Finding eigenvectors...')
        document_matrix = self.get_documents_matrix()
        theblend, study_concepts = self.get_blend()
        U, Sigma, V = theblend.normalize_all().svd(k=self.config('axes'))
        indices = [U.row_index(concept) for concept in study_concepts]
        reduced_U = U[indices]
        if self.is_associative():
            doc_rows = divisi2.aligned_matrix_multiply(document_matrix,
                                                       reduced_U)
            projections = reduced_U.extend(doc_rows)

        else:
            doc_indices = [
                V.row_index(doc.name) for doc in self.documents
                if doc.name in V.row_labels
            ]
            projections = reduced_U.extend(V[doc_indices])

        #if SUBTRACT_MEAN:
        #    sdoc_indices = [projections.row_index(doc.name) for doc in
        #    self.study_documents if doc.name in projections.row_labels]
        #    projections -= np.asarray(projections[sdoc_indices]).mean(axis=0)
        if SUBTRACT_MEAN:
            projections -= np.asarray(projections).mean(axis=0)

        return document_matrix, projections, Sigma
def createSpectralMatrix():
    # proj is a ReconstructedMatrix of the form terms:terms.
    proj = divisi2.load(
        r'C:\Users\LLPadmin\Desktop\luminoso\ThaiFoodStudy\Results\spectral.rmat')

    # create sparse matrix for clusters of the form terms:clusters (row, col).
    clusterMatrix, cluster_names, term_names, termsDict = randomClustersMatrix(proj.col_labels, 10)
    count = 0

    while True:
        count += 1
        clusterMatrix = divisi2.aligned_matrix_multiply(
            proj.left, divisi2.aligned_matrix_multiply(proj.right, clusterMatrix))
        repeat = normalize(clusterMatrix, termsDict, cluster_names, term_names)
        if repeat:
            print count
            break
    
    return clusterMatrix
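The helpers randomClustersMatrix and normalize are called above but never shown on this page. Below is a purely illustrative sketch of what randomClustersMatrix could look like, assuming divisi2.make_sparse builds a labeled SparseMatrix from (value, row_label, col_label) triples; the names and return shape follow the call site, not the original project's code.

import random
import divisi2

def randomClustersMatrix(term_names, k):
    """Hypothetical sketch: assign each term to one of k random clusters and
    return a sparse terms:clusters matrix plus the bookkeeping the loop expects."""
    term_names = list(term_names)
    cluster_names = ['cluster_%d' % i for i in range(k)]
    termsDict = {}
    entries = []
    for term in term_names:
        cluster = random.choice(cluster_names)
        termsDict[term] = cluster
        # divisi2.make_sparse is assumed to accept (value, row_label, col_label) triples
        entries.append((1.0, term, cluster))
    clusterMatrix = divisi2.make_sparse(entries)
    return clusterMatrix, cluster_names, term_names, termsDict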
Example #5
def expand_study(study_name):
    study = StudyDirectory(study_name).get_study()
    theblend, concepts = study.get_assoc_blend()
    U, S, V = theblend.normalize_all().svd(k=50)
    doc_rows = divisi2.aligned_matrix_multiply(study.get_documents_matrix(), U)
    projections = U.extend(doc_rows)
    spectral = divisi2.reconstruct_activation(projections, S, post_normalize=True)
    divisi2.save(spectral, study_name+'/Results/expanded.rmat')
def createSpectralMatrix(k):
    # proj is a ReconstructedMatrix of the form terms:terms.
    proj = divisi2.load(os.path.abspath('../../ThaiFoodStudy')+'/Results/spectral.rmat')
    #proj = examples.spreading_activation()

    # create sparse matrix for clusters of the form terms:clusters (row, col).
    clusterMatrix, cluster_names, term_names, termsDict = randomClustersMatrix(proj.row_labels, k)
    count = 0
    while True:
        count += 1
        print count
        clusterMatrix = divisi2.aligned_matrix_multiply(
            proj.left, divisi2.aligned_matrix_multiply(proj.right, clusterMatrix))
        repeat = normalize(clusterMatrix, termsDict, cluster_names, term_names)
        if repeat:
            print "Aftert "+str(count)+" iterations, we got acceptable clusters."
            break
    
    return clusterMatrix
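A small follow-up sketch, not from the study code: load the ReconstructedMatrix that expand_study saves and query it with row_named/top_items/entry_named, which are used elsewhere on this page. The study path and the concepts are placeholders.

spectral = divisi2.load('ThaiFoodStudy/Results/expanded.rmat')   # placeholder path
print spectral.row_named(u'spicy').top_items(10)        # terms/documents most associated with 'spicy'
print spectral.row_named(u'spicy').entry_named(u'curry')  # one association score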
Example #7
def expand_study(study_name):
    study = StudyDirectory(study_name).get_study()
    theblend, concepts = study.get_assoc_blend()
    U, S, V = theblend.normalize_all().svd(k=50)
    doc_rows = divisi2.aligned_matrix_multiply(study.get_documents_matrix(), U)
    projections = U.extend(doc_rows)
    spectral = divisi2.reconstruct_activation(projections,
                                              S,
                                              post_normalize=True)
    divisi2.save(spectral, study_name + '/Results/expanded.rmat')
def createSpectralMatrix(k):
    # proj is a ReconstructedMatrix of the form terms:terms.
    proj = divisi2.load(
        os.path.abspath('../../ThaiFoodStudy') + '/Results/spectral.rmat')
    #proj = examples.spreading_activation()

    # create sparse matrix for clusters of the form terms:clusters (row, col).
    clusterMatrix, cluster_names, term_names, termsDict = randomClustersMatrix(
        proj.row_labels, k)
    count = 0
    while True:
        count += 1
        print count
        clusterMatrix = divisi2.aligned_matrix_multiply(
            proj.left,
            divisi2.aligned_matrix_multiply(proj.right, clusterMatrix))
        repeat = normalize(clusterMatrix, termsDict, cluster_names, term_names)
        if repeat:
            print "Aftert " + str(
                count) + " iterations, we got acceptable clusters."
            break

    return clusterMatrix
Example #9
    def compute_stats(self, docs, spectral):
        """
        Calculate statistics.

        Consistency: how tightly-clustered the documents are in the spectral
        decomposition space.

        Centrality: a Z-score for how "central" each concept and document
        is. Same general idea as "congruence" from Luminoso 1.0.
        """

        if len(self.study_documents) <= 1:
            # consistency and centrality are undefined
            consistency = None
            centrality = None
            correlation = None
            core = None
            key_concepts = None
            c_centrality = None
            c_correlation = None
        else:
            # Determine which indices of the association matrix correspond to
            # documents.
            doc_indices = [spectral.row_index(doc.name)
                           for doc in self.study_documents
                           if doc.name in spectral.row_labels]
            valid_concepts = [c for c in spectral.row_labels if not c.endswith('.txt')]
            concept_indices = [spectral.row_index(c) for c in valid_concepts]
            
            # Make an ad hoc category of documents, then find how much each
            # document is associated with this average document.
            category_vec = divisi2.DenseVector(spectral.shape[0], spectral.row_labels)
            category_vec[doc_indices] = 1.0/len(doc_indices)
            all_assoc = spectral.left_category(category_vec)
            doc_assoc = all_assoc[doc_indices]
            
            # Calculate similarity statistics over all documents.
            doc_mean = np.mean(np.asarray(doc_assoc))
            doc_stdev = np.std(np.asarray(doc_assoc))
            doc_stderr = doc_stdev / np.sqrt(len(doc_indices))

            # ...and over all concepts, though we may not need this.
            all_mean = np.mean(np.asarray(all_assoc))
            all_stdev = np.std(np.asarray(all_assoc))
            all_stderr = all_stdev / np.sqrt(spectral.shape[0])

            consistency = doc_mean / doc_stderr
            centrality = divisi2.DenseVector((all_assoc - doc_mean) / doc_stderr, spectral.row_labels)
            correlation = divisi2.DenseVector(all_assoc / doc_stderr, spectral.row_labels)
            core = centrality.top_items(len(centrality)/2)
            core = [c[0] for c in core
                    if c[0] in valid_concepts
                    and c[1] > .001][:20]

            c_centrality = {}
            c_correlation = {}
            key_concepts = {}
            
            # the number of times each concept appears in each document
            doc_occur = self._documents_matrix

            # the average number of occurrences you expect of each document
            baseline = (1.0 + doc_occur.col_op(len)) / doc_occur.shape[0]
            for doc in self.canonical_documents:
                # record centrality and correlation for this document
                c_centrality[doc.name] = centrality.entry_named(doc.name)
                c_correlation[doc.name] = correlation.entry_named(doc.name)

                # find a weighted vector of similar documents
                docvec = np.maximum(0, spectral.row_named(doc.name)[doc_indices]) ** 3
                docvec /= (0.0001 + np.sum(docvec))
                keyvec = divisi2.aligned_matrix_multiply(docvec, doc_occur)

                assert not any(np.isnan(keyvec))
                assert not any(np.isinf(keyvec))
                interesting = spectral.row_named(doc.name)[concept_indices]
                #interesting = keyvec/baseline
                key_concepts[doc.name] = []
                for key, val in interesting.top_items(5):
                    if val > 0.0 and keyvec.entry_named(key) > 0.0:
                        key_concepts[doc.name].append((key, keyvec.entry_named(key)))
        
        return {
            'num_documents': self.num_documents,
            'num_concepts': spectral.shape[0] - self.num_documents,
            'consistency': consistency,
            'centrality': c_centrality,
            'correlation': c_correlation,
            'key_concepts': key_concepts,
            'core': core,
            'timestamp': list(time.localtime())
        }
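A toy illustration of the statistics computed above, with made-up numbers: consistency is the mean document association divided by its standard error, and centrality z-scores any row against the same reference.

import numpy as np

doc_assoc = np.array([0.42, 0.35, 0.51, 0.47])         # hypothetical doc-to-category associations
doc_mean = doc_assoc.mean()
doc_stderr = doc_assoc.std() / np.sqrt(len(doc_assoc))
consistency = doc_mean / doc_stderr                    # how tightly clustered the documents are
concept_centrality = (0.30 - doc_mean) / doc_stderr    # z-score for one concept's association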
Example #10
work = [u'business', u'job']
religion = [u'faith', u'religion', u'church']
food = [u'food', u'coffee', u'wine', u'apple']
travel = [u'travel', u'traveling']

# curated, we start with these categories
sportCat = divisi2.category(u'sport', u'basketball', u'soccer', u'entertainment', u'football', u'baseball', u'ski')
artCat = divisi2.category(u'guitar', u'acoustic guitar', u'music', u'classical music', u'poetry', u'piano', u'jazz',  u'art', u'dance', u'design')
learningCat = divisi2.category(u'education', u'research', u'literature', u'news', u'science')
moviesCat = divisi2.category(u'theater', u'cinema', u'television', u'movies', u'theatre')
workCat = divisi2.category(u'business', u'job')
religionCat = divisi2.category(u'faith', u'religion', u'church')
foodCat = divisi2.category(u'food', u'coffee', u'wine', u'apple')
travelCat = divisi2.category(u'travel', u'traveling')

sport_features = divisi2.aligned_matrix_multiply(sport, matrix)
sport_features.to_dense().top_items()
sim.left_category(sport).top_items()
sim.left_category(sport).entry_named('run')

catList = [sport, art, learning, movies, work, religion, food, travel]
catMatrix = [sportCat, artCat, learningCat, moviesCat, workCat, religionCat, foodCat, travelCat]
catString = ['sport', 'art', 'learning', 'movies', 'work', 'religion', 'food', 'travel']

# removing interests we've already categorized
needCat = []
usedCat = []
for cat in catList:
	for i in range(len(cat)):
		usedCat.append(cat[i])
for interest in found:
Example #11
    def compute_stats(self, docs, spectral):
        """
        Calculate statistics.

        Consistency: how tightly-clustered the documents are in the spectral
        decomposition space.

        Centrality: a Z-score for how "central" each concept and document
        is. Same general idea as "congruence" from Luminoso 1.0.
        """

        if len(self.study_documents) <= 1:
            # consistency and centrality are undefined
            consistency = None
            centrality = None
            correlation = None
            core = None
            key_concepts = None
            c_centrality = None
            c_correlation = None
        else:
            # Determine which indices of the association matrix correspond to
            # documents.
            doc_indices = [
                spectral.row_index(doc.name) for doc in self.study_documents
                if doc.name in spectral.row_labels
            ]
            valid_concepts = [
                c for c in spectral.row_labels if not c.endswith('.txt')
            ]
            concept_indices = [spectral.row_index(c) for c in valid_concepts]

            # Make an ad hoc category of documents, then find how much each
            # document is associated with this average document.
            category_vec = divisi2.DenseVector(spectral.shape[0],
                                               spectral.row_labels)
            category_vec[doc_indices] = 1.0 / len(doc_indices)
            all_assoc = spectral.left_category(category_vec)
            doc_assoc = all_assoc[doc_indices]

            # Calculate similarity statistics over all documents.
            doc_mean = np.mean(np.asarray(doc_assoc))
            doc_stdev = np.std(np.asarray(doc_assoc))
            doc_stderr = doc_stdev / np.sqrt(len(doc_indices))

            # ...and over all concepts, though we may not need this.
            all_mean = np.mean(np.asarray(all_assoc))
            all_stdev = np.std(np.asarray(all_assoc))
            all_stderr = all_stdev / np.sqrt(spectral.shape[0])

            consistency = doc_mean / doc_stderr
            centrality = divisi2.DenseVector(
                (all_assoc - doc_mean) / doc_stderr, spectral.row_labels)
            correlation = divisi2.DenseVector(all_assoc / doc_stderr,
                                              spectral.row_labels)
            core = centrality.top_items(len(centrality) / 2)
            core = [
                c[0] for c in core if c[0] in valid_concepts and c[1] > .001
            ][:20]

            c_centrality = {}
            c_correlation = {}
            key_concepts = {}

            # the number of times each concept appears in each document
            doc_occur = self._documents_matrix

            # the average number of occurrences you expect of each document
            baseline = (1.0 + doc_occur.col_op(len)) / doc_occur.shape[0]
            for doc in self.canonical_documents:
                # record centrality and correlation for this document
                c_centrality[doc.name] = centrality.entry_named(doc.name)
                c_correlation[doc.name] = correlation.entry_named(doc.name)

                # find a weighted vector of similar documents
                docvec = np.maximum(0,
                                    spectral.row_named(
                                        doc.name)[doc_indices])**3
                docvec /= (0.0001 + np.sum(docvec))
                keyvec = divisi2.aligned_matrix_multiply(docvec, doc_occur)

                assert not any(np.isnan(keyvec))
                assert not any(np.isinf(keyvec))
                interesting = spectral.row_named(doc.name)[concept_indices]
                #interesting = keyvec/baseline
                key_concepts[doc.name] = []
                for key, val in interesting.top_items(5):
                    if val > 0.0 and keyvec.entry_named(key) > 0.0:
                        key_concepts[doc.name].append(
                            (key, keyvec.entry_named(key)))

        return {
            'num_documents': self.num_documents,
            'num_concepts': spectral.shape[0] - self.num_documents,
            'consistency': consistency,
            'centrality': c_centrality,
            'correlation': c_correlation,
            'key_concepts': key_concepts,
            'core': core,
            'timestamp': list(time.localtime())
        }
Example #12
File: study.py  Project: rafacb/luminoso
    def compute_stats(self, docs, spectral):
        """
        Calculate statistics.

        Consistency: how tightly-clustered the documents are in the spectral
        decomposition space.

        Centrality: a Z-score for how "central" each concept and document
        is. Same general idea as "congruence" from Luminoso 1.0.
        """

        if len(self.study_documents) <= 1:
            # consistency and centrality are undefined
            consistency = None
            centrality = None
            correlation = None
            core = None
            key_concepts = None
            c_centrality = None
            c_correlation = None
        else:
            concept_sums = docs.col_op(np.sum)
            doc_indices = [spectral.left.row_index(doc.name)
                           for doc in self.study_documents
                           if doc.name in spectral.left.row_labels]
            
            # Compute the association of all study documents with each other
            assoc_grid = np.asarray(spectral[doc_indices, doc_indices].to_dense())
            assert not np.any(np.isnan(assoc_grid))
            assoc_list = []
            for i in xrange(1, assoc_grid.shape[0]):
                assoc_list.extend(assoc_grid[i, :i])

            reference_mean = np.mean(assoc_list)
            reference_stdev = np.std(assoc_list)
            reference_stderr = reference_stdev / len(doc_indices)
            consistency = reference_mean / reference_stderr

            ztest_stderr = reference_stdev / np.sqrt(len(doc_indices))

            all_assoc = np.asarray(spectral[:, doc_indices].to_dense())
            all_means = np.mean(all_assoc, axis=1)
            all_stdev = np.std(all_assoc, axis=1)
            all_stderr = all_stdev / np.sqrt(len(doc_indices))
            centrality = divisi2.DenseVector((all_means - reference_mean) /
              ztest_stderr, spectral.row_labels)
            correlation = divisi2.DenseVector(all_means / ztest_stderr,
              spectral.row_labels)
            core = centrality.top_items(100)
            core = [c[0] for c in core
                    if c[0] in concept_sums.labels
                    and concept_sums.entry_named(c[0]) >= 2][:10]

            c_centrality = {}
            c_correlation = {}
            key_concepts = {}
            sdoc_indices = [spectral.col_index(sdoc.name)
                            for sdoc in self.study_documents
                            if sdoc.name in spectral.col_labels]
            doc_occur = np.abs(np.minimum(1, self._documents_matrix.to_dense()))
            baseline = (1.0 + np.sum(np.asarray(doc_occur),
              axis=0)) / doc_occur.shape[0]
            for doc in self.canonical_documents:
                c_centrality[doc.name] = centrality.entry_named(doc.name)
                c_correlation[doc.name] = correlation.entry_named(doc.name)
                docvec = np.maximum(0, spectral.row_named(doc.name)[sdoc_indices])
                docvec /= (0.00001 + np.sum(docvec))
                keyvec = divisi2.aligned_matrix_multiply(docvec, doc_occur)
                interesting = keyvec / baseline
                key_concepts[doc.name] = []
                for key, val in interesting.top_items(5):
                    key_concepts[doc.name].append((key, keyvec.entry_named(key)))
        
        return {
            'num_documents': self.num_documents,
            'num_concepts': spectral.shape[0] - self.num_documents,
            'consistency': consistency,
            'centrality': c_centrality,
            'correlation': c_correlation,
            'key_concepts': key_concepts,
            'core': core,
            'timestamp': list(time.localtime())
        }
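Another made-up-numbers sketch, this time for the pairwise variant above: the consistency statistic comes from the strict lower triangle of the document-by-document association grid, mirroring the formula in the code (standard deviation divided by the number of documents).

import numpy as np

assoc_grid = np.array([[1.0, 0.4, 0.3],
                       [0.4, 1.0, 0.5],
                       [0.3, 0.5, 1.0]])               # hypothetical doc-doc associations
pairs = [assoc_grid[i, j]
         for i in range(1, assoc_grid.shape[0]) for j in range(i)]
consistency = np.mean(pairs) / (np.std(pairs) / assoc_grid.shape[0])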
Example #13
    def categoryTopFeatures(self, category, n=20):
        category_features = divisi2.aligned_matrix_multiply(category, self.getSMatrix())
        return [(x[0][1], self.getById(x[0][2]), x[1])
                for x in category_features.to_dense().top_items(n)]
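Hypothetical usage of categoryTopFeatures, borrowing divisi2.category from Example #10; `model` stands in for an instance of the unshown class that defines getSMatrix() and getById().

foodCat = divisi2.category(u'food', u'coffee', u'wine', u'apple')
for relation, item, weight in model.categoryTopFeatures(foodCat, n=10):
    print relation, item, weight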