Example #1
File: model.py Project: blazarus/charmme
    def __init__(self, model_dir):
        """
        A LuminosoModel is constructed from `model_dir`, a path to a directory.
        This directory will contain saved versions of various matrices, as
        well as a SQLite database of terms and documents.
        """
        if not isinstance(model_dir, unicode):
            # Ensure that paths are unicode.
            model_dir = model_dir.decode(sys.getfilesystemencoding())
        if not os.access(model_dir, os.R_OK):
            raise IOError("Cannot read the study directory %s. "
                          "Use LuminosoModel.make() to make a new one."
                          % model_dir)
        self.dir = model_dir
        self._load_config()
        self._load_assoc()
        self.database = TermDatabase(
          self.filename_in_dir(LuminosoModel.DB_FILENAME)
        )
        self.connections_cache = {}
        self.idf_cache = {}
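A minimal usage sketch for this constructor (the path below is hypothetical, and the directory must already exist and be readable, for example one created earlier with LuminosoModel.make() or make_empty()):

    model = LuminosoModel(u'/path/to/existing/model')   # hypothetical path
    print(model.dir)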
Example #2
File: model.py Project: blazarus/charmme
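# Note: this excerpt assumes the module-level imports from the rest of
# model.py (os, sys, json, codecs, numpy as np, divisi2,
# collections.defaultdict, and the project's own helpers such as
# TermDatabase, Config, get_reader, PrioritySet, and handle_url), which
# are not shown here.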
class LuminosoModel(object):
    """
    A LuminosoModel is a semantic space. You supply it with as many documents
    as possible from the domain of documents you intend to analyze, or
    possibly other forms of domain-specific knowledge.

    The LuminosoModel represents the semantic similarities between things
    as a Divisi2 reconstructed association matrix. This matrix can be
    updated incrementally to take new data into account, which is how
    Luminoso learns new domain-specific knowledge.
    """
    CONFIG_FILENAME = 'luminoso.cfg'
    ASSOC_FILENAME = 'associations.rmat'
    DB_FILENAME = 'terms.sqlite'

    def __init__(self, model_dir):
        """
        A LuminosoModel is constructed from `model_dir`, a path to a directory.
        This directory will contain saved versions of various matrices, as
        well as a SQLite database of terms and documents.
        """
        if not isinstance(model_dir, unicode):
            # Ensure that paths are unicode.
            model_dir = model_dir.decode(sys.getfilesystemencoding())
        if not os.access(model_dir, os.R_OK):
            raise IOError("Cannot read the study directory %s. "
                          "Use LuminosoModel.make() to make a new one."
                          % model_dir)
        self.dir = model_dir
        self._load_config()
        self._load_assoc()
        self.database = TermDatabase(
          self.filename_in_dir(LuminosoModel.DB_FILENAME)
        )
        self.connections_cache = {}
        self.idf_cache = {}
    
    def filename_in_dir(self, filename):
        """
        Given a filename relative to this LuminosoModel's directory, get its
        complete path.
        """
        return self.dir + os.sep + filename
    
    def file_exists_in_dir(self, filename):
        """
        Determine whether a file exists in this LuminosoModel's directory.
        """
        return os.access(self.filename_in_dir(filename), os.F_OK)

    def _load_config(self):
        "Load the configuration file."
        if self.file_exists_in_dir(LuminosoModel.CONFIG_FILENAME):
            self.config = Config(
              open(self.filename_in_dir(LuminosoModel.CONFIG_FILENAME))
            )
        else:
            raise IOError("This model is missing a config file.")

    def save_config(self):
        "Save the current configuration to the configuration file."
        save_config_file(
          self.config,
          self.filename_in_dir(LuminosoModel.CONFIG_FILENAME)
        )

    def save_canonical_stats(self, study='all'):
        """
        Given a study named 'foo', this saves its statistics to
        'foo.stats.json'.
        """
        stats = self.canonical_stats(study)
        out = codecs.open(self.filename_in_dir(study+'.stats.json'), 'w',
                          encoding='utf-8')
        json.dump(stats, out, indent=2, ensure_ascii=False)
        out.close()

    def _load_assoc(self):
        "Load the association matrix and priority queue from a file."
        if self.file_exists_in_dir(LuminosoModel.ASSOC_FILENAME):
            self.assoc = load_pickle(
                self.filename_in_dir(LuminosoModel.ASSOC_FILENAME)
            )
            assert isinstance(self.assoc, ReconstructedMatrix)
        else:
            raise IOError("This LuminosoModel does not have an "
                          "'associations.rmat' file. Use LuminosoModel.make() "
                          "to make a valid LuminosoModel.")
        self.assoc.make_symmetric()
        assert isinstance(self.assoc.row_labels, PrioritySet)

        self.priority = self.assoc.row_labels
        self.priority.listen_for_drops(self.on_drop)

    def save_assoc(self):
        "Save the association matrix to a file."
        save_pickle(self.assoc, 
                    self.filename_in_dir(LuminosoModel.ASSOC_FILENAME))
    
    def on_drop(self, index, key):
        """
        Handle when a key falls out of the PrioritySet.
        """
        self.assoc.left[index, :] = 0

    def add_document(self, doc, reader_name=None):
        """
        Take in a document, pass it through the reader, and store its terms
        in the term database.

        The document should be expressed as a dictionary, containing at least
        these keys:
        
        - name: the unique identifier for the document
        - text: the plain text of the document, possibly including text-encoded
          tags
        - url: a unique identifier for the document, preferably one that
          actually locates it relative to the study

        Optionally, it may contain:
        
        - tags: (key, value) tuples representing tags
        """
        LOG.info("Reading document: %r" % doc['url'])
        if reader_name is None:
            reader_name = self.config['reader']
        reader = get_reader(reader_name)
        text = doc['text']
        tags = doc.get('tags', [])
        doc_terms = []
        connections = list(reader.extract_connections(text))
        self.connections_cache[doc['url']] = connections
        for weight, term1, term2 in connections:
            if term1 == DOCUMENT:
                if isinstance(term2, tuple) and term2[0] == TAG:
                    tags.append(term2[1:])
                else:
                    doc_terms.append((term2, weight))
                    relevance = self.database.term_relevance(term2)
                    self.index_term(term2, relevance)

        doc['reader'] = reader_name
        doc['terms'] = doc_terms
        doc['tags'] = tags
        self.database.add_document(doc)
        self.idf_cache = {}   # invalidate the cache of term IDFs
        return doc['url']
    
    def get_document_connections(self, docid):
        """
        Given a previously added document, get the list of connections
        produced from it.
        """
        if docid in self.connections_cache:
            connections = self.connections_cache[docid]
        else:
            doc = self.database.get_document(docid)
            reader = get_reader(doc.reader)
            connections = list(reader.extract_connections(doc.text))
        return connections
    
    def get_document_terms(self, docid):
        """
        Given a previously added document, get the list of weighted terms
        that appear in it, as (term, weight) tuples.
        """
        return [(term2, weight) for (weight, term1, term2)
                in self.get_document_connections(docid)
                if term1 == DOCUMENT]
    
    def get_document_tags(self, docid):
        """
        Get the list of tags on a document from the database.
        """
        return self.database.get_document_tags(docid)
        
    def document_assoc_updates(self, docid):
        """
        Given a previously added document, yield triples to use to update the 
        association matrix.
        """
        LOG.info("Collecting connections from: %r" % docid)
        connections = self.get_document_connections(docid)
        for weight, term1, term2 in connections:
            if weight > 0 and term1 != DOCUMENT:
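                # Scale the weight down by the geometric mean of the two
                # terms' counts (with add-one smoothing), so that very
                # frequent terms don't dominate the association matrix.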
                norm_factor = ((self.database.count_term(term1) + 1)
                               * (self.database.count_term(term2) + 1)) ** .5
                yield weight/norm_factor, term1, term2
    
    def index_term(self, term, priority=None):
        """
        Ensure that a term is in both the database and the PrioritySet.
        If `priority` is specified, this will update its priority value.

        Returns the index of the term in the set.
        """
        index = self.priority.add(term)
        if priority:
            self.priority.update(term, priority)
        return index

    def get_term_idf(self, term):
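        """
        Get the inverse document frequency of a term from the database,
        caching the result until new documents are added.
        """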
        if term in self.idf_cache:
            return self.idf_cache[term]
        else:
            idf = self.database.term_idf(term)
            self.idf_cache[term] = idf
            return idf

    def learn_assoc(self, weight, term1, term2):
        """
        Learn the strength of the association between term1 and term2,
        both of which should exist in self.priority for efficiency's sake.
        For the purpose of testing, however, we can still add the terms.
        """
        try:
            row = self.priority.index(term1)
        except KeyError:
            row = self.priority.add(term1)
        try:
            col = self.priority.index(term2)
        except KeyError:
            col = self.priority.add(term2)

        mse = self.assoc.hebbian_increment(row, col, weight)
        return mse

    def learn_from(self, url, study=None, iterations=1):
        """
        Given a URL or file path that points to a collection of documents,
        learn from all of those documents. They may also be added to a
        study at the same time.

        This defaults to 1 iteration, because on a reasonable amount of data
        that will be all you need.
        """
        self.add_from_url(url, study, learn_iterations=iterations)
        self.save_assoc()
        self.save_config()
    # compatibility with previous docs
    learn_from_url = learn_from

    def add_from(self, url, study=None, learn_iterations=0, batch_size=1000):
        """
        Given a URL or file path that points to a collection of documents,
        add all the documents to the database. If `learn_iterations` is 0,
        the concept model will not change. When greater than 0, this 
        implements `learn_from_url`.

        This is the main loop that one should use to train a model with a
        batch of documents.
        """
        stream = handle_url(url, sample_frac=self.config.get('sample_frac', 1.0))
        batch = []
        while True:
            if len(batch) == batch_size:
                self.add_batch(lambda: iter(batch), study, learn_iterations)
                batch[:] = []
            try:
                batch.append(next(stream))
            except StopIteration:
                self.add_batch(lambda: iter(batch), study, learn_iterations)
                batch[:] = []
                break
    add_from_url = add_from

    def add_batch(self, stream_func, study=None, learn_iterations=0):
        """
        Add a batch of documents from some source, a `stream_func` that
        when called returns an iterator over the documents.
        """
        fulltext_cache = {}
        self.connections_cache = {}

        # First pass: add documents to the term database, and meanwhile
        # collect full texts and tags.
        for doc in stream_func():
            docid = self.add_document(doc)
            reader = get_reader(doc['reader'])
            for term, fulltext in reader.extract_term_texts(doc['text']):
                fulltext_cache[term] = fulltext
            if study is not None:
                self.database.set_tag_on_document(docid, 'study', study)
        
        LOG.info("Committing documents to the database")
        self.database.commit()

        LOG.info("Collecting relevant terms")
        self.database.update_relevance()

        if learn_iterations:
            # Second pass (optional): find how much we should update the
            # ReconstructedMatrix entries based on the word associations
            # we discover.
            learn_accumulator = defaultdict(float)
            for doc in stream_func():
                for weight, term1, term2\
                 in self.document_assoc_updates(doc['url']):
                    if term1 in self.priority and term2 in self.priority:
                        learn_accumulator[(term1, term2)] += weight

            # Now actually apply those total updates. Multiple times, if asked.
            total = len(learn_accumulator)
            for iter in xrange(learn_iterations):
                LOG.info("Updating association matrix: pass %d" % (iter+1))
                i = 0
                avg_err = 1.0
                for term1, term2 in learn_accumulator:
                    i += 1
                    if (i % 100) == 0:
                        LOG.info("Learned %d/%d; err=%4.4f"
                                 % (i, total, avg_err))
                    weight = learn_accumulator[(term1, term2)]
                    err = self.learn_assoc(weight, term1, term2)
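                    # Keep an exponentially weighted moving average of the
                    # reconstruction error for the progress log.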
                    avg_err = (.999 * avg_err) + (.001 * err)
        
        # Finally, update the full texts of the terms we saw.
        LOG.info("Updating full texts")
        for term, fulltext in fulltext_cache.items():
            self.database.set_term_text(term, fulltext)
        self.database.commit()
        
        # If this was a study, make a document matrix for it.
        if study is not None:
            LOG.info("Making document matrix for %r" % study)
            self.update_doc_matrix(study)
        LOG.info("Updating tag matrix")
        self.update_tag_matrix()
    
    def docs_in_study(self, study_name='all'):
        """
        Get a list of all documents in the given study.
        """
        return list(self.database.documents_with_tag_value(u'study',
                                                           study_name))

    def update_doc_matrix(self, study_name='all'):
        """
        Collect the documents in a particular study, and make a dense matrix
        from them representing their positions in this semantic space.

        FIXME: this filename may conflict with other things like 'tags'.
        """
        docs = self.docs_in_study(study_name)
        npmat = np.zeros((len(docs), self.config['num_axes']))
        dmat = divisi2.DenseMatrix(npmat, row_labels=docs)
        for docid in docs:
            row = dmat.row_index(docid)
            dmat[row] = self.vector_from_document(docid)
        divisi2.save(dmat, self.filename_in_dir(study_name+'.dmat'))

    def get_doc_matrix(self, study_name='all'):
        """
        Get the matrix of all documents in a particular study.
        """
        return divisi2.load(self.filename_in_dir(study_name+'.dmat'))

    def add_default_study(self, study_name='all'):
        """
        Ensure that every known document is in a study with the given name
        (default 'all'). Many methods for working with documents require a
        study name. This will help with experimenting with those methods
        on documents that weren't added as part of a study.
        """
        for doc in self.database.all_documents():
            self.database.set_tag_on_document(doc, 'study', study_name)

    def update_tag_matrix(self):
        """
        Collect the tags in a particular study, and make a dense matrix
        from them representing their average positions in this semantic space.
        """
        all_tags = self.database.all_tags()
        npmat = np.zeros((len(all_tags), self.config['num_axes']))
        dmat = divisi2.DenseMatrix(npmat, row_labels=all_tags)
        for key, value in all_tags:
            row = dmat.row_index((key, value))
            ndocs = 0
            for docid in self.database.documents_with_tag_value(key, value):
                dmat[row] += self.vector_from_document(docid)
                ndocs += 1
            if ndocs > 0:
                dmat[row] /= ndocs
        divisi2.save(dmat, self.filename_in_dir('tags.dmat'))
        self._tag_matrix = dmat
        return dmat
    
    def get_tag_matrix(self):
        """
        Get the matrix of all tags in a particular study.
        """
        if hasattr(self, '_tag_matrix'):
            return self._tag_matrix
        else:
            return divisi2.load(self.filename_in_dir('tags.dmat'))

    def export_svdview(self, study_name='all', num=10000):
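        """
        Export the top `num` terms and the documents in the given study in
        the packed format used by the svdview visualization tool.
        """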
        from divisi2.export_svdview import write_packed
        def denormalize(concept_text):
            doc = self.database.get_document(concept_text)
            if doc:
                return doc.name
            else:
                return concept_text
                #return self.database.get_term_text(concept_text)
        
        top_terms = [term.term for term in self.database.top_terms(num)]
        num = len(top_terms)
        term_mat = divisi2.DenseMatrix(
          np.zeros((num, self.config['num_axes'])),
          row_labels=top_terms
        )
        for i in xrange(num):
            term = top_terms[i]
            term_mat[i, :] = self.assoc.left[self.priority.index(term)]

        mat = term_mat.concatenate(self.get_doc_matrix(study_name))
        write_packed(mat, self.filename_in_dir(study_name), denormalize)

    def vector_from_terms(self, terms):
        """
        Get a category vector representing the given set of weighted terms,
        expressed as (term, weight) tuples. This will apply TF-IDF weighting.
        """
        total_weight = 0.0
        for _, weight in terms:
            total_weight += abs(weight)

        vec = divisi2.DenseVector(
            np.zeros((len(self.priority),)),
            labels=self.priority
        )
        for term, weight in terms:
            if term in self.priority:
                index = self.priority.index(term)
                tfidf_weight = (weight * self.get_term_idf(term)
                                * self.database.normalized_relevance(term))
                vec[index] = tfidf_weight / total_weight
        category = divisi2.dot(vec, self.assoc.left)
        return category

    def vector_from_text(self, text, reader_name=None):
        """
        Get a category vector in this model representing the given text,
        with TF-IDF applied.
        """
        if reader_name is None:
            reader_name = self.config['reader']
        reader = get_reader(reader_name)
        terms = []
        for weight, term1, term2 in reader.extract_connections(text):
            if term1 == DOCUMENT:
                terms.append((term2, weight))
        return self.vector_from_terms(terms)
    
    def vector_from_input(self):
        """
        Get a category vector representing a line of input read from
        standard input (which is a good way to enter Unicode that IPython
        can't deal with).
        """
        text = raw_input('> ')
        return self.vector_from_text(text.decode('utf-8'))

    def vector_from_document(self, doc_id):
        """
        Get a category vector for the given known document, with TF-IDF
        applied.
        """
        terms = self.get_document_terms(doc_id)
        return self.vector_from_terms(terms)
    
    def terms_similar_to_vector(self, vec):
        """
        Take in a category vector, and return a weighted vector of
        associated terms. You can run the `top_items()` method of this vector
        to get the most associated terms.
        """
        return divisi2.dot(self.assoc.left, vec)
    
    def domain_terms_similar_to_vector(self, vec):
        """
        Take in a category vector, and return a weighted vector of
        associated terms, but leave out ones that only appear in common
        sense background knowledge.

        You can run the `top_items()` method of this vector
        to get the most associated terms.
        """
        # FIXME: this way of finding domain concepts is such a hack.
        mask = np.zeros((len(self.priority),), 'b')
        for i, item in enumerate(self.priority.items):
            if (i in self.priority.priority and
                    self.priority.priority[i] < 1e6):
                mask[i] = True
        return divisi2.multiply(
            divisi2.dot(self.assoc.left.normalize_rows(offset=1.0), vec),
            mask
        )
    
    def docs_similar_to_vector(self, vec, study='all'):
        """
        Take in a category vector, and return a weighted vector of
        associated documents in the study. You can run the `top_items()`
        method of this vector to get the most associated documents.
        """
        return divisi2.dot(self.get_doc_matrix(study).normalize_rows(offset=1.0), vec)

    def tags_similar_to_vector(self, vec):
        """
        Take in a category vector, and return a weighted vector of
        associated tags in the study. You can run the `top_items()`
        method of this vector to get the most associated tags.
        """
        return divisi2.dot(self.get_tag_matrix().normalize_rows(offset=1.0), vec)
    
    def show_sim(self, similarities, n=10):
        """
        Display similar terms or documents in a human-readable form
        at the command line.
        """
        for name, value in similarities.top_items(n):
            doc = self.database.get_document(name)
            if doc:
                printable_name = doc.name
            else:
                printable_name = self.database.get_term_text(name)
            print "%40s  %+4.4f" % (printable_name[:40].encode('utf-8'), value)

    def canonical_stats(self, study='all', canonicals='Canonical'):
        """
        Get the correlation/centrality stats from a study, as compared to
        the documents in another study designated 'canonical'. That study
        is probably not a real study, just a set of documents as it has
        always been, but it is represented in the same way.

        The default canonical study is in fact the one named 'Canonical'.

        TODO: average the documents as the study is being learned, allowing
        streaming and very large studies.
        """
        stats = {'correlation': {}, 'centrality': {}}

        # calculate the rms concept-concept similarity, as a scale factor
        mean_concept = np.mean(self.assoc.left_view, axis=0)
        concept_concept = np.dot(self.assoc.left_view, mean_concept)
        baseline = np.sqrt(np.mean(concept_concept ** 2))

        # np.asarray it so that we can apply numpy functions to it safely.
        study_matrix = np.asarray(self.get_doc_matrix(study))
        canonical_matrix = self.get_doc_matrix(canonicals)

        # First, find the (presumed) normal distribution for how much the
        # documents in this study are like each other. That distribution
        # (and particularly its mean) is called "consistency".
        mean_document = np.mean(np.asarray(study_matrix), axis=0)
        mean_doc_projections = np.dot(self.assoc.left_view, mean_document)

        stats['consistency'] = _mean_var_stats(mean_doc_projections, baseline)

        # Next, find a similar distribution for how much the documents in
        # this study are like each canonical document.

        for c_row in xrange(canonical_matrix.shape[0]):
            canonical_id = canonical_matrix.row_label(c_row)
            canonical_vec = np.asarray(canonical_matrix[c_row, :])
            canonical_projections = np.dot(self.assoc.left_view,
                                           canonical_vec)
            correlation_stats = _mean_var_stats(canonical_projections,
                                                baseline)
            stats['correlation'][canonical_id] = correlation_stats

            centrality = ((correlation_stats['mean'] -
                           stats['consistency']['mean'])
                          / correlation_stats['stderr'])
            stats['centrality'][canonical_id] = centrality
        return stats

    def __repr__(self):
        return "<LuminosoModel: %r>" % self.dir

    @staticmethod
    def make(model_dir, orig_dmat, config):
        """
        Make a new LuminosoModel in the (nonexistent) directory `model_dir`,
        with initial half-association matrix `orig_dmat`. (A half-association
        matrix is a matrix that gives an association matrix when it is
        multiplied by its transpose.)
        """
        # Adjust the size of the matrix to match the config, if necessary.
        if os.access(model_dir, os.F_OK):
            raise StudyExists("The model directory %r already exists." % model_dir)
        rows = config['num_concepts']
        cols = config['num_axes']
        if orig_dmat.shape != (rows, cols):
            dmat = divisi2.DenseMatrix((rows, cols))
            rows_to_copy = orig_dmat.shape[0]
            if rows < rows_to_copy:
                raise ValueError("num_concepts is too small to fit the "
                                 "existing concepts.")
            cols_to_copy = min(cols, orig_dmat.shape[1])
            dmat[:rows_to_copy, :cols_to_copy] = \
              orig_dmat[:rows_to_copy, :cols_to_copy]
            dmat.row_labels = orig_dmat.row_labels
        else:
            dmat = orig_dmat
        
        # Make sure that the matrix has a PrioritySet for its row labels.
        _prioritize_labels(dmat, rows)    
        rmat = divisi2.reconstruct_symmetric(dmat)

        # Make the model directory and populate its initial files.
        os.mkdir(model_dir)
        rmat_file = model_dir + os.sep + LuminosoModel.ASSOC_FILENAME
        config_file = model_dir + os.sep + LuminosoModel.CONFIG_FILENAME
        save_pickle(rmat, rmat_file)
        save_config_file(config, config_file)

        # Now load the model from that directory and return it.
        model = LuminosoModel(model_dir)
        return model

    @staticmethod
    def make_empty(model_dir, config=None):
        """
        Make a LuminosoModel that starts from an empty matrix.
        """
        if config is None:
            config = _default_config()
        mat = divisi2.DenseMatrix((config['num_concepts'], config['num_axes']))
        model = LuminosoModel.make(model_dir, mat, config)
        return model

    @staticmethod
    def make_english(model_dir, config=None):
        """
        Make a LuminosoModel whose initial matrix contains common sense
        in English.
        """
        return LuminosoModel.make_common_sense(model_dir, 'en', config)

    @staticmethod
    def make_japanese(model_dir, config=None):
        """
        Make a LuminosoModel whose initial matrix contains common sense
        in Japanese.
        """
        return LuminosoModel.make_common_sense(model_dir, 'ja', config)

    @staticmethod
    def make_common_sense(model_dir, lang='en', config=None):
        """
        Make a LuminosoModel whose initial matrix contains common sense
        for some language.
        """
        if config is None:
            config = _default_config()
            config['reader'] = 'simplenlp.'+lang
        if os.access(model_dir, os.F_OK):
            raise StudyExists("The model directory %r already exists." % model_dir)
        LOG.info("Making common sense matrix")
        assoc = divisi2.network.conceptnet_assoc(lang)
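        # Normalize the ConceptNet association matrix and reduce it to 100
        # dimensions with a truncated SVD before reconstructing it.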
        (mat_U, diag_S, _) = assoc.normalize_all().svd(k=100)
        rmat = divisi2.reconstruct_activation(
            mat_U, diag_S, post_normalize=True
        )
        model = LuminosoModel.make(model_dir, rmat.left, config)
        model.config['iteration'] = 1000
        return model
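Below is a hedged, minimal sketch of how this class might be used end to end, based only on the methods shown above; the import path, file paths, document collection format, and study name are assumptions rather than part of the original project.

    from model import LuminosoModel   # hypothetical import path

    # Seed a new model directory with English common-sense knowledge.
    # The directory must not exist yet; make_english() creates it.
    model = LuminosoModel.make_english('/tmp/demo_model')

    # Learn from a collection of documents and save the updated matrices
    # and configuration.
    model.learn_from('/tmp/documents.json', study='demo')

    # Place a new piece of text in the semantic space and show the most
    # similar terms at the command line.
    vec = model.vector_from_text(u'text to place in the semantic space')
    model.show_sim(model.terms_similar_to_vector(vec), n=10)

show_sim() is used for the final display because it already handles looking up readable names for terms and documents; top_items() on the returned vector works as well, as the docstrings above note.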