Example #1
    def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
        is_corpus, current_representation = utils.is_corpus(current_representation)
        if is_corpus:
            for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
                ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
                assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
                chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

                ln.debug("Chunk converted to csc, running through layer..")
                chunk_trans = layer[chunk_as_csc]

                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("Finished serializing chunk. Processed %s documents so far." %
                         (chunk_no * chunksize + len(chunk)))
        else:
            ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
            ln.debug("Type of current_representation is %s" % type(current_representation))
            for chunk_no, chunk in enumerate(current_representation):
                ln.debug("converting chunk (%s documents)..." % chunksize)
                chunk_trans = layer[chunk]
                ln.debug("Serializing hidden representation..")
                fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
                np.save(fname, chunk_trans)
                ln.debug("finished serializing chunk.")

        ln.info("Finished serializing all chunks.")
Example #2
    def update(self, corpus):
        save_freq = max(1, int(10000 / self.chunksize))  # save every 10k docs, roughly
        chunks_processed = 0
        start_time = time.time()

        while True:
            for chunk in utils.grouper(corpus, self.chunksize):
                self.update_chunk(chunk)
                self.m_num_docs_processed += len(chunk)
                chunks_processed += 1

                if self.update_finished(start_time, chunks_processed, self.m_num_docs_processed):
                    self.update_expectations()
                    alpha, beta = self.hdp_to_lda()
                    self.lda_alpha = alpha
                    self.lda_beta = beta
                    self.print_topics(20)
                    if self.outputdir:
                        self.save_topics()
                    return

                elif chunks_processed % save_freq == 0:
                    self.update_expectations()
                    # self.save_topics(self.m_num_docs_processed)
                    self.print_topics(20)
                    logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D)
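
The `save_freq` arithmetic above just converts the "roughly every 10,000 documents" target into a chunk count, never less than one. A throwaway check with a few arbitrary chunk sizes:

    for chunksize in (256, 2000, 10000, 50000):
        save_freq = max(1, int(10000 / chunksize))  # save every 10k docs, roughly
        print(chunksize, save_freq)
    # 256 -> 39, 2000 -> 5, 10000 -> 1, 50000 -> 1 (clamped so the modulo check never divides by zero)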
Example #3
    def __iter__(self):
        if self.chunksize:
            for chunk in utils.grouper(self.corpus, self.chunksize):
                for transformed in self.obj.__getitem__(chunk, chunksize=None):
                    yield transformed
        else:
            for doc in self.corpus:
                yield self.obj[doc]
Example #4
        def transformed_corpus():
            for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
                ln.debug("Converting chunk %s to csc format.." % chunk_no)
                chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
                ln.debug("Computing hidden representation for chunk.. ")
                hidden = self._get_hidden_representations(chunk)
                ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                        (chunk_no, chunk_no * chunksize + len(doc_chunk)))
                for column in hidden.T:
                    yield matutils.dense2vec(column.T)
                ln.debug("Done yielding chunk %s" % chunk_no)

            ln.info("Finished computing representations for all chunks.")
Example #5
    def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda,
                    ldapost, iter_, bound, lda_inference_max_iter, chunksize):
        """
        Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound.
        Need to pass the LdaSeq model, corpus, sufficient stats, gammas and lhoods matrices previously created,
        and LdaModel and LdaPost class objects.
        """
        doc_index = 0  # overall doc_index in corpus
        time = 0  # current time-slice
        doc_num = 0  # doc-index in current time-slice
        lda = self.make_lda_seq_slice(lda, time)  # create lda_seq slice

        time_slice = np.cumsum(np.array(self.time_slice))

        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            # iterates chunk size for constant memory footprint
            for doc in chunk:
                # this is used to update the time_slice and create a new lda_seq slice every new time_slice
                if doc_index > time_slice[time]:
                    time += 1
                    lda = self.make_lda_seq_slice(lda, time)  # create lda_seq slice
                    doc_num = 0

                gam = gammas[doc_index]
                lhood = lhoods[doc_index]

                ldapost.gamma = gam
                ldapost.lhood = lhood
                ldapost.doc = doc

                # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible.
                if iter_ == 0:
                    doc_lhood = LdaPost.fit_lda_post(
                        ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter
                    )
                else:
                    doc_lhood = LdaPost.fit_lda_post(
                        ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter
                    )

                if topic_suffstats is not None:
                    topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats)

                gammas[doc_index] = ldapost.gamma
                bound += doc_lhood
                doc_index += 1
                doc_num += 1

        return bound, gammas
Example #6
    def __iter__(self):
        for chunk_no, chunk in enumerate(utils.grouper(self.corpus, self.chunksize)):
            nnz = sum(len(doc) for doc in chunk)
            # construct the job as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense matrix!
            # ln.debug("converting corpus to csc format")
            if self.dense:
                job = matutils.corpus2dense(chunk, num_docs=len(chunk), num_terms=self.num_terms)
            else:
                job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz)

            if self.filter_dimensions is not None:
                filtered = job[self.filter_dimensions, :]
            else:
                filtered = None

            yield job, filtered
            del chunk
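
Both branches of Example #6 build a terms-by-documents matrix for the chunk. Assuming gensim is importable, a tiny standalone check of the two layouts (the two-document toy chunk is made up):

    import numpy as np
    from gensim import matutils

    chunk = [[(0, 1.0), (2, 3.0)],   # doc 0
             [(1, 2.0)]]             # doc 1
    num_terms = 4

    dense = matutils.corpus2dense(chunk, num_terms=num_terms, num_docs=len(chunk))
    sparse = matutils.corpus2csc(chunk, num_terms=num_terms, num_docs=len(chunk), num_nnz=3)

    print(dense.shape, sparse.shape)  # both (num_terms, num_docs) == (4, 2)
    assert np.allclose(dense, sparse.toarray())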
Example #7
    def __iter__(self):
        """Iterate over the corpus.

        If `chunksize` is set, works in "batch-manner" (more efficient).

        Yields
        ------
        list of (int, number)
            Document in BoW format

        """
        if self.chunksize:
            for chunk in utils.grouper(self.corpus, self.chunksize):
                for transformed in self.obj.__getitem__(chunk, chunksize=None):
                    yield transformed
        else:
            for doc in self.corpus:
                yield self.obj[doc]
Example #8
    def __iter__(self):
        """Iterate over the corpus, applying the selected transformation.

        If `chunksize` was set in the constructor, works in "batch-manner" (more efficient).

        Yields
        ------
        list of (int, number)
            Documents in the sparse Gensim bag-of-words format.

        """
        if self.chunksize:
            for chunk in utils.grouper(self.corpus, self.chunksize):
                for transformed in self.obj.__getitem__(chunk, chunksize=None):
                    yield transformed
        else:
            for doc in self.corpus:
                yield self.obj[doc]
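
Examples #3, #7 and #8 are the same wrapper pattern: a lazy corpus view that pushes documents through a transformation either one at a time or, when `chunksize` is set, in batches via `__getitem__(chunk, chunksize=None)`. Below is a minimal self-contained sketch of that pattern; `DoublingModel`, `TransformedCorpusSketch` and `_grouper` are invented stand-ins, not gensim classes:

    import itertools

    def _grouper(iterable, chunksize):
        """Minimal stand-in for utils.grouper: yield lists of up to `chunksize` items."""
        it = iter(iterable)
        while True:
            chunk = list(itertools.islice(it, chunksize))
            if not chunk:
                return
            yield chunk

    class DoublingModel(object):
        """Toy 'transformation': double every weight of a bag-of-words document."""

        def transform_one(self, doc):
            return [(term_id, 2 * weight) for term_id, weight in doc]

        def __getitem__(self, doc_or_chunk, chunksize=1):
            if chunksize is None:
                # batch mode: doc_or_chunk is a list of documents
                return [self.transform_one(doc) for doc in doc_or_chunk]
            return self.transform_one(doc_or_chunk)

    class TransformedCorpusSketch(object):
        """Lazy view over `corpus` that applies `obj` per document or per chunk."""

        def __init__(self, obj, corpus, chunksize=None):
            self.obj, self.corpus, self.chunksize = obj, corpus, chunksize

        def __iter__(self):
            if self.chunksize:
                for chunk in _grouper(self.corpus, self.chunksize):
                    for transformed in self.obj.__getitem__(chunk, chunksize=None):
                        yield transformed
            else:
                for doc in self.corpus:
                    yield self.obj[doc]

    corpus = [[(0, 1.0)], [(1, 2.0)], [(2, 3.0)]]
    print(list(TransformedCorpusSketch(DoublingModel(), corpus, chunksize=2)))
    # [[(0, 2.0)], [(1, 4.0)], [(2, 6.0)]]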
Example #9
    def __init__(self, noise, lambda_, input_dimensionality, output_dimensionality=None, prototype_ids=None):
        self.noise = noise
        self.lambda_ = lambda_
        self.input_dimensionality = input_dimensionality

        self.output_dimensionality = output_dimensionality or input_dimensionality
        if self.output_dimensionality != self.input_dimensionality:
            if prototype_ids is None:
                ln.warn("Need prototype IDs to train reduction layer.")

        self.randomized_indices = list(utils.grouper(np.random.permutation(self.input_dimensionality),
                                                     self.output_dimensionality))
        for idx_batch in self.randomized_indices:
            idx_batch.sort()  # should be more efficient when selecting array rows in order later on

        self.prototype_ids = prototype_ids

        self.num_folds = int(np.ceil(float(self.input_dimensionality) / self.output_dimensionality))
        self.blocks = []
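
In Example #9, the constructor splits a random permutation of the input dimensions into `ceil(input_dimensionality / output_dimensionality)` sorted index batches, one per fold. A standalone illustration of that bookkeeping, with arbitrarily chosen dimensions:

    import itertools
    import numpy as np

    input_dim, output_dim = 10, 4

    it = iter(np.random.permutation(input_dim))
    randomized_indices = []
    while True:
        batch = list(itertools.islice(it, output_dim))
        if not batch:
            break
        batch.sort()  # sorted indices make the later row selection cheaper
        randomized_indices.append(batch)

    num_folds = int(np.ceil(float(input_dim) / output_dim))
    assert len(randomized_indices) == num_folds  # 3 folds here, of sizes 4, 4 and 2
    print(randomized_indices)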
Example #10
    def update(self, corpus):
        """Train the model with new documents, by EM-iterating over `corpus` until any of the conditions is satisfied.

        * time limit expired
        * chunk limit reached
        * whole corpus processed

        Parameters
        ----------
        corpus : iterable of list of (int, float)
            Corpus in BoW format.

        """
        save_freq = max(1, int(10000 / self.chunksize))  # save every 10k docs, roughly
        chunks_processed = 0
        start_time = time.time()

        while True:
            for chunk in utils.grouper(corpus, self.chunksize):
                self.update_chunk(chunk)
                self.m_num_docs_processed += len(chunk)
                chunks_processed += 1

                if self.update_finished(start_time, chunks_processed, self.m_num_docs_processed):
                    self.update_expectations()
                    alpha, beta = self.hdp_to_lda()
                    self.lda_alpha = alpha
                    self.lda_beta = beta
                    self.print_topics(20)
                    if self.outputdir:
                        self.save_topics()
                    return

                elif chunks_processed % save_freq == 0:
                    self.update_expectations()
                    # self.save_topics(self.m_num_docs_processed)
                    self.print_topics(20)
                    logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D)
Example #11
    def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=None):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached).

        In distributed mode, the E step is distributed over a cluster of machines.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].
        """
        # use parameters given in constructor, unless user explicitly overrode them
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every

        # rho is the "speed" of updating; TODO try other fncs
        rho = lambda: pow(1.0 + self.num_updates, -decay)

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaModel.update() called with an empty corpus")
            return
        self.state.numdocs += lencorpus

        if update_every > 0:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info("running %s LDA training, %s topics, %i passes over "
                    "the supplied corpus of %i documents, updating model once "
                    "every %i documents" %
                    (updatetype, self.num_topics, passes, lencorpus, updateafter))
        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider "
                           "increasing the number of passes to improve accuracy")

        for iteration in xrange(passes):
            ##### reset all workers
            if self.dispatcher:
                status = MPI.Status()

                ##### send reset message, with current state; ensure all ready
                for i in xrange(self.numworkers):
                    self.comm.sendrecv(self.state, dest=i+1, sendtag=RESET, source=i+1)

                logger.info('initializing %s workers' % self.numworkers)

            else:
                other = LdaState(self.eta, self.state.sstats.shape)
            dirty = False

            ##### counters for jobs
            count_sent = 0
            count_recv = 0

            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)):
                if self.dispatcher:

                    ##### send some work
                    logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
                                (iteration, chunk_no * chunksize + len(chunk), lencorpus))
                    count_sent += 1
                    status = MPI.Status()

                    ##### send the initial batch
                    if (chunk_no < self.numworkers):
                        self.comm.send(chunk, dest=chunk_no+1, tag=WORK)

                    ##### send work if we just cleaned out the workers
                    elif not dirty:
                        self.comm.send(chunk, dest=(chunk_no % self.numworkers) + 1, tag=WORK)

                    ##### wait around for ready workers
                    else:
                        self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        source = status.Get_source()
                        count_recv += 1
                        self.comm.send(chunk, dest=source, tag=WORK)

                else:
                    logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                                (iteration, chunk_no * chunksize + len(chunk), lencorpus))
                    self.do_estep(chunk, other)
                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:

                    ##### wait for all workers to finish
                    if self.dispatcher:
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")

                        ##### workers are finishing up
                        while (count_recv < count_sent):
                            self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                            count_recv += 1

                        ##### placeholder for the result
                        result = None
                        result_recv = 0

                        ##### send the merge/clear messages
                        for i in xrange(self.numworkers):
                            self.comm.send(None, dest=i+1, tag=MERGE)

                        ##### wait for all results
                        while (result_recv < self.numworkers):
                            r = self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                            result_recv += 1
                            if result_recv == 1:
                                result = r
                            else:
                                result.merge(r)
                        other = result

                    self.do_mstep(rho(), other)
                    del other # free up some mem

                    if self.dispatcher:
                        logger.info('initializing workers')

                        ##### send reset message, with current state
                        for i in xrange(self.numworkers):
                            self.comm.sendrecv(self.state, dest=i+1, sendtag=RESET, source=i+1)

                    else:
                        other = LdaState(self.eta, self.state.sstats.shape)
                    dirty = False
            #endfor single corpus iteration

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")

                    ##### workers are finishing up
                    while (count_recv < count_sent):
                        self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        count_recv += 1

                    ##### placeholder for the result
                    result = None
                    result_recv = 0

                    ##### send the merge/clear messages
                    for i in xrange(self.numworkers):
                        self.comm.send(None, dest=i+1, tag=MERGE)

                    ##### wait for all results
                    while (result_recv < self.numworkers):
                        r = self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        result_recv += 1
                        if result_recv == 1:
                            result = r
                        else:
                            result.merge(r)
                    other = result

                self.do_mstep(rho(), other)
                del other
                dirty = False
        #endfor entire corpus update

        ##### kill the workers
        if self.dispatcher:
            for i in xrange(self.numworkers):
                self.comm.send(None, dest=i+1, tag=DIE)
            logger.info("workers are dead")
Example #12
    def train(self, sentences, total_words=None, word_count=0, chunksize=100, total_examples=None, queue_factor=2, report_delay=1):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For word2mat, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
        sentences are the same as those that were used to initially build the vocabulary.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension not loaded for word2mat, training will be slow. "
                          "Install a C compiler and reinstall gensim for fast training.")
            self.neg_labels = []
            if self.negative > 0:
                # precompute negative labels optimization for pure-python training
                self.neg_labels = zeros(self.negative + 1)
                self.neg_labels[0] = 1.

        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using sg=%s hs=%s sample=%s and negative=%s",
            self.workers, len(self.vocab), self.layer1_size, self.sg,
            self.hs, self.sample, self.negative)

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0'):
            raise RuntimeError("you must first finalize vocabulary before training the model")

        if total_words is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info("expecting %i examples, matching count from corpus used for vocabulary survey", total_examples)
            else:
                raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations")
        logging.info("initiallize sentence")
        sentences = EnumerateSentence(sentences)
        logging.info("initiallize sentence finish")
        if self.iter > 1:
            sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
            total_words = total_words and total_words * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_init():
            work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            context_vector = matutils.zeros_aligned(self.topic_size, dtype=REAL)
            return (work, neu1, context_vector)

        def worker_one_job(job, inits):
            items, alpha = job
            if items is None:  # signal to finish
                return False
            # train & return tally
            tally, raw_tally = self._do_train_job(items, alpha, inits)
            progress_queue.put((len(items), tally, raw_tally))  # report progress
            return True

        def worker_loop():
            """Train the model, lifting lists of sentences from the jobs queue."""
            init = worker_init()
            while True:
                job = job_queue.get()
                if not worker_one_job(job, init):
                    break

        start, next_report = default_timer(), 1.0

        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        if self.workers > 0:
            job_queue = Queue(maxsize=queue_factor * self.workers)
        else:
            job_queue = FakeJobQueue(worker_init, worker_one_job)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        pushed_words = 0
        pushed_examples = 0
        example_count = 0
        trained_word_count = 0
        raw_word_count = word_count
        push_done = False
        done_jobs = 0
        next_alpha = self.alpha
        jobs_source = enumerate(utils.grouper(sentences, chunksize))
        # fill jobs queue with (sentence, alpha) job tuples
        while True:
            try:
                job_no, items = next(jobs_source)
                logger.debug("putting job #%i in the queue at alpha %.05f", job_no, next_alpha)
                job_queue.put((items, next_alpha))
                # update the learning rate before every next job
                if self.min_alpha < next_alpha:
                    if total_examples:
                        # examples-based decay
                        pushed_examples += len(items)
                        next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_examples / total_examples)
                    else:
                        # words-based decay
                        #pushed_words += self._raw_word_count(items)
                        pushed_words += self._raw_word_count([item[1] for item in items])
                        next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_words / total_words)
                    next_alpha = max(next_alpha, self.min_alpha)
            except StopIteration:
                logger.info(
                    "reached end of input; waiting to finish %i outstanding jobs",
                    job_no - done_jobs + 1)
                for _ in xrange(self.workers):
                    job_queue.put((None, 0))  # give the workers heads up that they can finish -- no more work!
                push_done = True
            try:
                while done_jobs < (job_no+1) or not push_done:
                    examples, trained_words, raw_words = progress_queue.get(push_done)  # only block after all jobs pushed
                    example_count += examples
                    trained_word_count += trained_words  # only words in vocab & sampled
                    raw_word_count += raw_words
                    done_jobs += 1
                    elapsed = default_timer() - start
                    if elapsed >= next_report:
                        if total_examples:
                            # examples-based progress %
                            logger.info(
                                "PROGRESS: at %.2f%% examples, %.0f words/s",
                                100.0 * example_count / total_examples, trained_word_count / elapsed)
                        else:
                            # words-based progress %
                            logger.info(
                                "PROGRESS: at %.2f%% words, %.0f words/s",
                                100.0 * raw_word_count / total_words, trained_word_count / elapsed)
                        next_report = elapsed + report_delay  # don't flood log, wait report_delay seconds
                else:
                    # loop ended by job count; really done
                    break
            except Empty:
                pass  # already out of loop; continue to next push

        elapsed = default_timer() - start
        logger.info(
            "training on %i raw words took %.1fs, %.0f trained words/s",
            raw_word_count, elapsed, trained_word_count / elapsed if elapsed else 0.0)

        if total_examples and total_examples != example_count:
            logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
        if total_words and total_words != raw_word_count:
            logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        self.clear_sims()
        return trained_word_count
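
The producer loop in Example #12 decays the learning rate linearly from `alpha` down to `min_alpha` in proportion to how much of the input has already been pushed, clamping at `min_alpha`. A small sketch of that schedule with made-up numbers:

    def next_alpha(pushed, total, alpha=0.025, min_alpha=0.0001):
        """Linear decay from alpha toward min_alpha as `pushed` approaches `total`."""
        decayed = alpha - (alpha - min_alpha) * (pushed / float(total))
        return max(decayed, min_alpha)

    for pushed in (0, 2500, 5000, 7500, 10000):
        print(pushed, next_alpha(pushed, 10000))
    # decays linearly from 0.025 at the start to 0.0001 once everything has been pushed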
Example #13
    def once_test(self,
                  sentences,
                  total_words=None,
                  word_count=0,
                  chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn(
                "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`"
            )
        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s"
            % (self.workers, len(self.vocab), self.layer1_size, self.sg,
               self.hs, self.sample, self.negative))

        if not self.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count = [word_count]
        #total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab)))
        total_words = 0
        for i in range(len(sentences)):
            total_words += len(sentences[i])

        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        jobs = Queue(maxsize=2 * self.workers)
        # lock for shared state (=number of words trained so far, log reports...)
        lock = threading.Lock()

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(
                self.layer1_size * self.window,
                dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size * self.window,
                                          dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha *
                    (1 - 1.0 *
                     (word_count[0] + self.now_iterated * total_words) /
                     (total_words * self.iteration)))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count

                job_words = sum(
                    train_sentence_test(self, sentence[0], sentence[1], alpha,
                                        work, neu1) for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        print "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (
                            100.0 *
                            (word_count[0] + self.now_iterated * total_words) /
                            (total_words * self.iteration), alpha,
                            word_count[0] / elapsed if elapsed else 0.0)
                        next_report[
                            0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [
            threading.Thread(target=worker_train) for _ in xrange(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        def prepare_sentences():
            for number, sentence in enumerate(sentences):
                # avoid calling random_sample() where prob >= 1, to speed things up a little:
                sampled = [
                    self.vocab[word] for word in sentence
                    if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or
                                               self.vocab[word].sample_probability >= random.random_sample())
                ]
                yield (number, sampled)

        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(
                utils.grouper(prepare_sentences(), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" %
                         (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed,
                     word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
Example #14
    def train(self, texts, chunksize=100, workers=2):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        logger.info("training model with %i workers" % (workers))

        start, next_report = time.time(), [1.0]
        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        jobs = Queue(maxsize=2 * workers)
        # lock for shared state (=number of words trained so far, log reports...)
        lock = threading.Lock()

        total_error = [0.0]
        objects_done = [0]

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            observation_work = np.zeros(self.window * self.size + self.object_size, dtype=REAL)
            prediction_work = np.zeros(self.output_size, dtype=REAL)
            composition_work = np.zeros(
                [max(self.output_size, self.window * self.size + self.object_size),
                 self.window * self.size + self.object_size],
                dtype=REAL) if self.bilinear_form else None

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                error = sum(
                    train_sentence_concatenation(
                        self, sentence, object_index, softmax_target, sigmoid_target,
                        self._alpha, prediction_work, observation_work, composition_work)
                    for sentence, object_index, softmax_target, sigmoid_target in job)
                with lock:
                    total_error[0] += error
                    objects_done[0] += len(job)
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: %s objects, %.0f objects/s" %
                                    (objects_done[0], float(objects_done[0]) /
                                     elapsed if elapsed else 0.0))
                        next_report[
                            0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        dynos = [
            threading.Thread(target=worker_train) for _ in range(0, workers)
        ]
        for thread in dynos:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
        no_oov = (
            (np.array([self.vocab.get_index(word) for word in sentence], dtype=INT),
             object_index, softmax_target, sigmoid_target)
            for sentence, object_index, softmax_target, sigmoid_target in texts)
        for job_no, job in enumerate(gensim_utils.grouper(no_oov, chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" %
                         (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())

        for _ in range(0, workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in dynos:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i objects took %.1fs, %.0f words/s" %
                    (objects_done[0], elapsed,
                     objects_done[0] / elapsed if elapsed else 0.0))

        return (objects_done[0], total_error[0])
Example #15
    def train(self, sentences, total_words=None, word_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of unicode strings.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension compilation failed, training will be slow. Install a C compiler and reinstall gensim for fast training.")
        logger.info("training model with %i workers on %i vocabulary and %i features, "
            "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
            (self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative))

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count = [word_count]
        total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab)) * self.iter)
        jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = self._get_job_words(alpha, work, job, neu1)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                            (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(utils.grouper(self._prepare_sentences(sentences), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
            (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))
        self.syn0norm = None
        return word_count[0]
Example #16
    def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None,
               update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (a repeatable stream of documents).

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0]. Additionally, for smaller
        `corpus` sizes, an increasing `offset` may be beneficial (see
        Table 1 in Hoffman et al.)

        If update is called with authors that already exist in the model, it will
        resume training on not only new documents for that author, but also the
        previously seen documents. This is necessary for those authors' topic
        distributions to converge.

        Every time `update(corpus, author2doc)` is called, the new documents are
        appended to all the previously seen documents, and `author2doc` is
        combined with the previously seen authors.

        To resume training on all the data seen by the model, simply call
        `update()`.

        It is not possible to add new authors to existing documents, as all
        documents in `corpus` are assumed to be new documents.

        Args:
            corpus (gensim corpus): The corpus with which the author-topic model should be updated.

            author2doc (dictionary): author to document mapping corresponding to indexes in input
                corpus.

            doc2author (dictionary): document to author mapping corresponding to indexes in input
                corpus.

            chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np
                array or not. np can in some settings turn the term IDs
                into floats; these will be converted back into integers in
                inference, which incurs a performance hit. For distributed
                computing it may be desirable to keep the chunks as np
                arrays.

        For other parameter settings, see :class:`AuthorTopicModel` constructor.

        """

        # use parameters given in constructor, unless user explicitly overrode them
        if decay is None:
            decay = self.decay
        if offset is None:
            offset = self.offset
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if eval_every is None:
            eval_every = self.eval_every
        if iterations is None:
            iterations = self.iterations
        if gamma_threshold is None:
            gamma_threshold = self.gamma_threshold

        # TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"),
        # the process simply gets killed.
        author2doc = deepcopy(author2doc)
        doc2author = deepcopy(doc2author)

        # TODO: it is not possible to add new authors to an existing document (all input documents are treated
        # as completely new documents). Perhaps this functionality could be implemented.
        # If it's absolutely necessary, the user can delete the documents that have new authors, and call update
        # on them with the new and old authors.

        if corpus is None:
            # Just keep training on the already available data.
            # Assumes self.update() has been called before with input documents and corresponding authors.
            assert self.total_docs > 0, 'update() was called with no documents to train on.'
            train_corpus_idx = [d for d in xrange(self.total_docs)]
            num_input_authors = len(self.author2doc)
        else:
            if doc2author is None and author2doc is None:
                raise ValueError('at least one of author2doc/doc2author must be specified, to establish input space dimensionality')

            # If either doc2author or author2doc is missing, construct them from the other.
            if doc2author is None:
                doc2author = construct_doc2author(corpus, author2doc)
            elif author2doc is None:
                author2doc = construct_author2doc(doc2author)

            # Number of authors that need to be updated.
            num_input_authors = len(author2doc)

            try:
                len_input_corpus = len(corpus)
            except TypeError:
                logger.warning("input corpus stream has no len(); counting documents")
                len_input_corpus = sum(1 for _ in corpus)
            if len_input_corpus == 0:
                logger.warning("AuthorTopicModel.update() called with an empty corpus")
                return

            self.total_docs += len_input_corpus

            # Add new documents in corpus to self.corpus.
            self.extend_corpus(corpus)

            # Obtain a list of new authors.
            new_authors = []
            # Sorting the author names makes the model more reproducible.
            for a in sorted(author2doc.keys()):
                if not self.author2doc.get(a):
                    new_authors.append(a)

            num_new_authors = len(new_authors)

            # Add new authors to the author2id/id2author dictionaries.
            for a_id, a_name in enumerate(new_authors):
                self.author2id[a_name] = a_id + self.num_authors
                self.id2author[a_id + self.num_authors] = a_name

            # Increment the number of total authors seen.
            self.num_authors += num_new_authors

            # Initialize the variational distributions q(theta|gamma)
            gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
            self.state.gamma = np.vstack([self.state.gamma, gamma_new])

            # Combine author2doc with self.author2doc.
            # First, increment the document IDs by the number of previously seen documents.
            for a, doc_ids in author2doc.items():
                author2doc[a] = [d + self.total_docs - len_input_corpus for d in doc_ids]

            # For all authors in the input corpus, add the new documents.
            for a, doc_ids in author2doc.items():
                if self.author2doc.get(a):
                    # This is not a new author, append new documents.
                    self.author2doc[a].extend(doc_ids)
                else:
                    # This is a new author, create index.
                    self.author2doc[a] = doc_ids

            # Add all new documents to self.doc2author.
            for d, a_list in doc2author.items():
                self.doc2author[d] = a_list

            # Train on all documents of authors in input_corpus.
            train_corpus_idx = []
            for a in author2doc.keys():  # For all authors in the input corpus.
                train_corpus_idx.extend(self.author2doc[a])  # All documents (old and new) of that author.

            # Make the list of training documents unique.
            train_corpus_idx = list(set(train_corpus_idx))

        # train_corpus_idx is only a list of indexes, so "len" is valid.
        lencorpus = len(train_corpus_idx)

        if chunksize is None:
            chunksize = min(lencorpus, self.chunksize)

        self.state.numdocs += lencorpus

        if update_every:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once "
            "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter,
            evalafter, iterations, gamma_threshold
        )

        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy")

        # rho is the "speed" of updating; TODO try other fncs
        # pass_ + num_updates handles increasing the starting t for each pass,
        # while allowing it to "reset" on the first pass of each update
        def rho():
            return pow(offset + pass_ + (self.num_updates / chunksize), -decay)

        for pass_ in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers', self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                # gamma is not needed in "other", thus its shape is (0, 0).
                other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
            dirty = False

            reallen = 0
            for chunk_no, chunk_doc_idx in enumerate(utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)):
                chunk = [self.corpus[d] for d in chunk_doc_idx]
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                    # log_perplexity requires the indexes of the documents being evaluated, to know what authors
                    # correspond to the documents.
                    self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus)

                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info(
                        "PROGRESS: pass %i, dispatching documents up to #%i/%i",
                        pass_, chunk_no * chunksize + len(chunk), lencorpus
                    )
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info(
                        "PROGRESS: pass %i, at document #%i/%i",
                        pass_, chunk_no * chunksize + len(chunk), lencorpus
                    )
                    # do_estep requires the indexes of the documents being trained on, to know what authors
                    # correspond to the documents.
                    gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx)

                    if self.optimize_alpha:
                        self.update_alpha(gammat, rho())

                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                        other = self.dispatcher.getstate()
                    self.do_mstep(rho(), other, pass_ > 0)
                    del other  # frees up memory

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
                    dirty = False
            # endfor single corpus iteration
            if reallen != lencorpus:
                raise RuntimeError("input corpus size changed during training (don't use generators as input)")

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other, pass_ > 0)
                del other
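
Examples #16 and #17 both weight each M-step with the same schedule, rho = (offset + pass_ + num_updates / chunksize) ** -decay, which is where the requirement that `decay` lie in (0.5, 1.0] comes from. A standalone sketch of how that weight shrinks as updates accumulate (parameter values are illustrative only):

    def rho(num_updates, chunksize=2000, offset=1.0, pass_=0, decay=0.5):
        """Weight given to the newly estimated sufficient statistics in the M step."""
        return pow(offset + pass_ + (num_updates / float(chunksize)), -decay)

    for num_updates in (0, 2000, 20000, 200000):
        print(num_updates, rho(num_updates))
    # the weight starts at 1.0 and decays toward 0 as more documents are processed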
Example #17
    def update(self, corpus, chunks_as_numpy=False):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (a repeatable stream of documents).

        The E-step is distributed into the several processes.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].

        """
        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaMulticore.update() called with an empty corpus")
            return

        self.state.numdocs += lencorpus

        if not self.batch:
            updatetype = "online"
            updateafter = self.chunksize * self.workers
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s LDA training, %s topics, %i passes over the"
            " supplied corpus of %i documents, updating every %i documents,"
            " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, self.passes, lencorpus, updateafter,
            evalafter, self.iterations, self.gamma_threshold)

        if updates_per_pass * self.passes < 10:
            logger.warning(
                "too few updates, training might not converge; consider "
                "increasing the number of passes or iterations to improve accuracy"
            )

        job_queue = Queue(maxsize=2 * self.workers)
        result_queue = Queue()

        # rho is the "speed" of updating; TODO try other fncs
        # pass_ + num_updates handles increasing the starting t for each pass,
        # while allowing it to "reset" on the first pass of each update
        def rho():
            return pow(
                self.offset + pass_ + (self.num_updates / self.chunksize),
                -self.decay)

        logger.info("training LDA model using %i processes", self.workers)
        pool = Pool(self.workers, worker_e_step, (
            job_queue,
            result_queue,
        ))
        for pass_ in xrange(self.passes):
            queue_size, reallen = [0], 0
            other = LdaState(self.eta, self.state.sstats.shape)

            def process_result_queue(force=False):
                """
                Clear the result queue, merging all intermediate results, and update the
                LDA model if necessary.

                """
                merged_new = False
                while not result_queue.empty():
                    other.merge(result_queue.get())
                    queue_size[0] -= 1
                    merged_new = True
                if (force and merged_new and queue_size[0] == 0) or (
                        not self.batch and (other.numdocs >= updateafter)):
                    self.do_mstep(rho(), other, pass_ > 0)
                    other.reset()
                    if self.eval_every is not None and (
                        (force and queue_size[0] == 0) or
                        (self.eval_every != 0 and
                         (self.num_updates / updateafter) % self.eval_every
                         == 0)):
                        self.log_perplexity(chunk, total_docs=lencorpus)

            chunk_stream = utils.grouper(corpus,
                                         self.chunksize,
                                         as_numpy=chunks_as_numpy)
            for chunk_no, chunk in enumerate(chunk_stream):
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                # put the chunk into the workers' input job queue
                chunk_put = False
                while not chunk_put:
                    try:
                        job_queue.put((chunk_no, chunk, self),
                                      block=False,
                                      timeout=0.1)
                        chunk_put = True
                        queue_size[0] += 1
                        logger.info(
                            'PROGRESS: pass %i, dispatched chunk #%i = '
                            'documents up to #%i/%i, outstanding queue size %i',
                            pass_, chunk_no,
                            chunk_no * self.chunksize + len(chunk), lencorpus,
                            queue_size[0])
                    except queue.Full:
                        # in case the input job queue is full, keep clearing the
                        # result queue, to make sure we don't deadlock
                        process_result_queue()

                process_result_queue()
            #endfor single corpus pass

            # wait for all outstanding jobs to finish
            while queue_size[0] > 0:
                process_result_queue(force=True)

            if reallen != lencorpus:
                raise RuntimeError(
                    "input corpus size changed during training (don't use generators as input)"
                )
        #endfor entire update

        pool.terminate()
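
Every snippet in this listing leans on the same streaming idiom: `utils.grouper` slices the corpus iterator into fixed-size lists, so only one chunk of documents is ever materialized in memory. As a point of reference, here is a minimal chunking helper written from scratch with `itertools.islice` (a sketch, not gensim's implementation; the real `utils.grouper` also accepts the `as_numpy` flag seen above, which this sketch omits):

import itertools

def grouper_sketch(iterable, chunksize):
    """Yield successive lists of up to `chunksize` items from `iterable`."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, chunksize))
        if not chunk:
            return
        yield chunk

# usage sketch on a toy bag-of-words corpus
toy_corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)], [(0, 1.0)], [(3, 1.0)]]
for chunk_no, chunk in enumerate(grouper_sketch(toy_corpus, chunksize=2)):
    print(chunk_no, len(chunk))  # prints "0 2" and "1 2"
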
Example #18
    def train(self,
              input_file=None,
              total_words=None,
              word_count=0,
              chunksize=100,
              alpha=0.025,
              alpha_doc=0.025,
              sentences_length=None):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn(
                "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`"
            )
        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s"
            % (self.workers, len(self.vocab), self.layer1_size, self.sg,
               self.hs, self.sample, self.negative))

        if not self.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")
        # set the learning rates
        self.alpha_doc = float(alpha_doc)
        self.alpha = float(alpha)

        start, next_report = time.time(), [1.0]
        word_count = [word_count]
        total_words = total_words or int(
            sum(v.count * v.sample_probability
                for v in itervalues(self.vocab)))
        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        jobs = Queue(maxsize=2 * self.workers)
        # for shared state (=number of words trained so far, log reports...)
        lock = threading.Lock()

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(
                self.syn1_size,
                dtype=REAL)  # each thread must have its own work memory
            # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            neu1 = zeros(self.syn1_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break

                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha * (1 - 1.0 * word_count[0] / total_words))
                if self.alpha_flag == 1:
                    alpha = self.alpha
                # print "alpha", alpha
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                if self.sg:

                    if self.skip_gram_type == 0:
                        # job_id = 0
                        # sentence_id_,_ = job[job_id]
                        # print "py sentence_id = ",sentence_id_
                        # bf = deepcopy(self.doc[sentence_id_][0])
                        # print "bf : ",bf
                        job_words = sum(
                            train_sentence_sg_simple(self, sentence_id,
                                                     sentence, alpha, work,
                                                     self.alpha_doc)
                            for sentence_id, sentence in job)
                        # print "af : ",self.doc[sentence_id_][0]
                        # print "re : ", self.doc[sentence_id_][0] - bf
                    elif self.skip_gram_type == 1:

                        # ids_back = [sentence_id for sentence_id,_ in job]
                        # bf_ = deepcopy(self.doc[ids_back])

                        job_words = sum(
                            train_sentence_sg_average(
                                self, sentence_id, sentence, alpha, work, neu1,
                                self.alpha_doc)
                            for sentence_id, sentence in job)

                        # af_ = self.doc[ids_back]
                        # print numpy.mean(af_ - bf_ )

                    elif self.skip_gram_type == 2:

                        ids_back = [sentence_id for sentence_id, _ in job]
                        bf_ = deepcopy(self.doc[ids_back])

                        job_words = sum(
                            train_sentence_sg_concat(
                                self, sentence_id, sentence, alpha, work, neu1,
                                self.alpha_doc)
                            for sentence_id, sentence in job)

                        af_ = self.doc[ids_back]
                        print numpy.mean(af_ - bf_)

                elif self.cbow_type == 4:

                    ids_back = [sentence_id for sentence_id, _ in job]
                    bf_ = deepcopy(self.doc[ids_back])

                    # job_words = sum(train_sentence_cbow_average_plus_doc_vec_extra_train(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)
                    job_words = sum(
                        train_sentence_cbow_average_plus_doc(
                            self, sentence_id, sentence, alpha, work, neu1,
                            self.alpha_doc) for sentence_id, sentence in job)

                    af_ = self.doc[ids_back]
                    print numpy.mean(af_ - bf_)

                    # print "re : ", af_ - bf_
                # elif self.cbow_type == 5:
                #     job_words = sum(train_sentence_cbow_concatenate_v2(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)
                elif self.cbow_type == 3:
                    job_id = 0
                    ids_back = [sentence_id for sentence_id, _ in job]
                    bf_ = deepcopy(self.doc[ids_back])
                    sentence_id_, sentence_ = job[job_id]
                    # # print "py sentence_id = ",sentence_id_
                    # bf = deepcopy(self.doc[sentence_id_])
                    # print "bf : ",bf
                    # print "null_vec", self.null_vec
                    job_words = sum(
                        train_sentence_cbow_concatenate(
                            self, sentence_id, sentence, alpha, work, neu1,
                            self.alpha_doc) for sentence_id, sentence in job)

                    # af_ = self.doc[ids_back]
                    # print numpy.mean(af_ - bf_ )
                    # print "af : ",self.doc[sentence_id_]
                    # print "re : ", self.doc[sentence_id_] - bf

                    # print sum(self.doc[sentence_id_] - bf)
                elif self.cbow_type == 2:
                    job_words = sum(
                        train_sentence_cbow_concatenate_syn1_doc(
                            self, sentence_id, sentence, alpha, work, neu1,
                            self.alpha_doc) for sentence_id, sentence in job)
                elif self.cbow_type == 1:

                    ids_back = [sentence_id for sentence_id, _ in job]
                    bf_ = deepcopy(self.doc[ids_back])
                    job_words = sum(
                        train_sentence_cbow_average_simple(
                            self, sentence_id, sentence, alpha, work, neu1,
                            self.alpha_doc) for sentence_id, sentence in job)

                    af_ = self.doc[ids_back]
                    print numpy.mean(af_ - bf_)
                    # print af_ - bf_

                elif self.cbow_type == 0:
                    job_words = sum(
                        train_sentence_cbow_syn1_doc(
                            self, sentence_id, sentence, alpha, work, neu1,
                            self.alpha_doc) for sentence_id, sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [
            threading.Thread(target=worker_train) for _ in xrange(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        def prepare_sentences():
            '''
                Shuffle the sentence indexes with random.shuffle and train over the sentences in random order.
            '''
            if self.random_learn_flag:
                # shuffle the input data and train over it in random order
                indexes_sentence_ids = numpy.array(range(sentences_length))
                random.shuffle(indexes_sentence_ids, lambda: random_seed)
                sentences = [(indexes_sentence_ids[index], sentence)
                             for index, sentence in enumerate(open(input_file))
                             ]

            else:
                sentences = enumerate(open(input_file))

            for sentence_id, sentence in sentences:
                sentence = sentence.split(u" ")
                # skip sentences that were already trained on (when continuing training from a saved model)
                if sentence_id < self.skip_id:
                    print "skip! :" + str(sentence_id) + " " + str(
                        self.skip_id)
                    continue
                sampled = [
                    self.vocab[word] for word in sentence
                    if word in self.vocab and (
                        self.vocab[word].sample_probability >= 1.0
                        or self.vocab[word].sample_probability >=
                        numpy.random.random_sample())
                ]
                yield (sentence_id, sampled)

        # no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(
                utils.grouper(prepare_sentences(), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" %
                         (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed,
                     word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
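
Example #18 above is built on a classic producer/consumer layout: the main thread pushes chunked jobs into a bounded `Queue`, worker threads pull jobs until they receive a `None` sentinel, and a shared counter guarded by a lock drives the progress logging. A stripped-down sketch of that skeleton (the `train_chunk` callable is a made-up stand-in for the Cython training kernels):

import threading
try:
    import queue  # Python 3
except ImportError:
    import Queue as queue  # Python 2

def run_workers(jobs_iterable, num_workers, train_chunk):
    """Feed chunks to worker threads and wait for them to drain the queue."""
    jobs = queue.Queue(maxsize=2 * num_workers)  # bounded queue limits read-ahead
    processed = [0]
    lock = threading.Lock()

    def worker():
        while True:
            job = jobs.get()
            if job is None:  # sentinel: no more work
                break
            count = train_chunk(job)
            with lock:  # protect the shared progress counter
                processed[0] += count

    threads = [threading.Thread(target=worker) for _ in range(num_workers)]
    for t in threads:
        t.daemon = True
        t.start()
    for job in jobs_iterable:
        jobs.put(job)  # blocks once the queue is full
    for _ in range(num_workers):
        jobs.put(None)  # one sentinel per worker
    for t in threads:
        t.join()
    return processed[0]

# toy usage: each "chunk" is a list of ints and "training" just counts them
# print(run_workers(([i, i + 1] for i in range(10)), num_workers=2, train_chunk=len))  # 20
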
Example #19
    def add_documents(self, corpus, chunksize=None, decay=None):
        """Update model with new `corpus`.

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}
            Stream of document vectors or sparse matrix of shape (`num_terms`, num_documents).
        chunksize : int, optional
            Number of documents to be used in each training chunk; will use `self.chunksize` if not specified.
        decay : float, optional
            Weight of existing observations relative to new ones; will use `self.decay` if not specified.

        Notes
        -----
        Training proceeds in chunks of `chunksize` documents at a time. The size of `chunksize` is a tradeoff
        between increased speed (bigger `chunksize`) vs. lower memory footprint (smaller `chunksize`).
        If the distributed mode is on, each chunk is sent to a different worker/computer.

        """
        logger.info("updating model with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.num_terms,
                                    self.num_topics,
                                    None,
                                    dtype=self.dtype)
                update.u, update.s = stochastic_svd(
                    corpus,
                    self.num_topics,
                    num_terms=self.num_terms,
                    chunksize=chunksize,
                    extra_dims=self.extra_samples,
                    power_iters=self.power_iters,
                    dtype=self.dtype)
                self.projection.merge(update, decay=decay)
                self.docs_processed += len(corpus) if hasattr(
                    corpus, '__len__') else 0
            else:
                # the one-pass algo
                doc_no = 0
                if self.dispatcher:
                    logger.info('initializing %s workers', self.numworkers)
                    self.dispatcher.reset()
                for chunk_no, chunk in enumerate(
                        utils.grouper(corpus, chunksize)):
                    logger.info("preparing a new chunk of documents")
                    nnz = sum(len(doc) for doc in chunk)
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    logger.debug("converting corpus to csc format")
                    job = matutils.corpus2csc(chunk,
                                              num_docs=len(chunk),
                                              num_terms=self.num_terms,
                                              num_nnz=nnz,
                                              dtype=self.dtype)
                    del chunk
                    doc_no += job.shape[1]
                    if self.dispatcher:
                        # distributed version: add this job to the job queue, so workers can work on it
                        logger.debug("creating job #%i", chunk_no)
                        # put job into queue; this will eventually block, because the queue has a small finite size
                        self.dispatcher.putjob(job)
                        del job
                        logger.info("dispatched documents up to #%s", doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(self.num_terms,
                                            self.num_topics,
                                            job,
                                            extra_dims=self.extra_samples,
                                            power_iters=self.power_iters,
                                            dtype=self.dtype)
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s", doc_no)
                        self.print_topics(5)

                # wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info(
                        "reached the end of input; now waiting for all remaining jobs to finish"
                    )
                    self.projection = self.dispatcher.getstate()
                self.docs_processed += doc_no
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            update = Projection(self.num_terms,
                                self.num_topics,
                                corpus.tocsc(),
                                extra_dims=self.extra_samples,
                                power_iters=self.power_iters,
                                dtype=self.dtype)
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents",
                        corpus.shape[1])
            self.docs_processed += corpus.shape[1]
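
The `add_documents` method above is what makes LSI incremental: the projection learned so far is merged with a projection computed from the new chunk, weighted by `decay`. A hedged usage sketch, assuming a standard gensim installation and a dictionary built beforehand (the toy documents are made up):

from gensim import corpora, models

documents = [["human", "interface", "computer"],
             ["survey", "user", "computer", "system"],
             ["graph", "minors", "trees"]]
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(doc) for doc in documents]

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)

# later: fold a new batch of documents into the existing projection
new_docs = [["user", "interface", "survey"]]
lsi.add_documents([dictionary.doc2bow(doc) for doc in new_docs])
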
Example #20
def stochastic_svd(corpus,
                   rank,
                   num_terms,
                   chunksize=20000,
                   extra_dims=None,
                   power_iters=0,
                   dtype=np.float64,
                   eps=1e-6):
    """Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse}
        Input corpus as a stream (does not have to fit in RAM)
        or a sparse matrix of shape (`num_terms`, num_documents).
    rank : int
        Desired number of factors to be retained after decomposition.
    num_terms : int
        The number of features (terms) in `corpus`.
    chunksize : int, optional
        Number of documents to be used in each training chunk.
    extra_dims : int, optional
        Extra samples to be used besides the rank `k`. Can improve accuracy.
    power_iters : int, optional
        Number of power iteration steps to be used. Increasing the number of power iterations improves accuracy,
        but lowers performance.
    dtype : numpy.dtype, optional
        Enforces a type for elements of the decomposed matrix.
    eps : float, optional
        Percentage of the spectrum's energy to be discarded.

    Notes
    -----
    The corpus may be larger than RAM (iterator of vectors). If `corpus` is a `scipy.sparse.csc` instead,
    it is assumed the whole corpus fits into core memory and a different (more efficient) code path is chosen.
    This may return fewer than the requested number of top `rank` factors, in case the input itself has lower rank.
    The `extra_dims` (oversampling) and especially `power_iters` (power iterations) parameters affect accuracy of the
    decomposition.

    This algorithm uses `2 + power_iters` passes over the input data. In case you can only afford a single pass,
    set `onepass=True` in :class:`~gensim.models.lsimodel.LsiModel` and avoid using this function directly.

    The decomposition algorithm is based on `"Finding structure with randomness:
    Probabilistic algorithms for constructing approximate matrix decompositions" <https://arxiv.org/abs/0909.4061>`_.


    Returns
    -------
    (np.ndarray 2D, np.ndarray 1D)
        The left singular vectors and the singular values of the `corpus`.

    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(
            10, 2 * rank
        )  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations",
                samples - rank, power_iters)

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = np.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix", str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (
            m, num_terms)
        o = np.random.normal(0.0, 1.0, (n, samples)).astype(
            y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr,
                                corpus.indices, corpus.data, o.ravel(),
                                y.ravel())  # y = corpus * o
        del o

        # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix", str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations", power_iters)
        for _ in range(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(
                q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i', (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(
                chunk, num_terms=num_terms,
                dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = np.random.normal(0.0, 1.0, (n, samples)).astype(
                dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(
                m,
                n,
                samples,
                chunk.indptr,
                chunk.indices,  # y = y + chunk * o
                chunk.data,
                o.ravel(),
                y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in range(power_iters):
            logger.info("running power iteration #%i", power_iter + 1)
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i',
                            chunk_no * chunksize, num_docs)
                # documents = columns of sparse CSC
                chunk = matutils.corpus2csc(chunk,
                                            num_terms=num_terms,
                                            dtype=dtype)
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix", str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = np.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=dtype)
        logger.info("2nd phase: constructing %s covariance matrix",
                    str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize,
                        num_docs)
            chunk = matutils.corpus2csc(chunk,
                                        num_terms=num_terms,
                                        dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            x += np.dot(
                b, b.T
            )  # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix",
                    str(x.shape))
        # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        u, s, vt = scipy.linalg.svd(x)
        # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
        s = np.sqrt(s)
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = np.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
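
The two-phase structure of `stochastic_svd` (sample the range of the corpus with a random Gaussian matrix, orthonormalize, optionally run power iterations, then decompose the small projected matrix) can be illustrated on a dense toy matrix. This is only a didactic numpy sketch of the same Halko et al. idea, not the streamed, sparse-aware code above:

import numpy as np

def randomized_svd_sketch(A, rank, oversample=10, power_iters=2, seed=0):
    """Toy dense version of the two-phase randomized SVD."""
    rng = np.random.default_rng(seed)
    m, n = A.shape
    k = min(rank + oversample, min(m, n))
    Y = A @ rng.standard_normal((n, k))   # phase 1: sample the range of A
    Q, _ = np.linalg.qr(Y)                # orthonormalize
    for _ in range(power_iters):          # power iterations sharpen the spectrum
        Q, _ = np.linalg.qr(A @ (A.T @ Q))
    B = Q.T @ A                           # phase 2: project A onto the small subspace
    u_b, s, _ = np.linalg.svd(B, full_matrices=False)
    return (Q @ u_b)[:, :rank], s[:rank]

A = np.random.default_rng(1).standard_normal((200, 50))
u, s = randomized_svd_sketch(A, rank=5)
exact = np.linalg.svd(A, compute_uv=False)[:5]
print(np.abs(s - exact) / exact)  # relative errors on the leading singular values (expected to be small)
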
Example #21
    def update(self, corpus, chunks_as_numpy=False):
        """Train the model with new documents, by EM-iterating over `corpus` until the topics converge
        (or until the maximum number of allowed iterations is reached).

        Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until
        the maximum number of allowed iterations is reached. `corpus` must be an iterable. The E step is distributed
        into the several processes.

        Notes
        -----
        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional
            Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`) used to update the
            model.
        chunks_as_numpy : bool
            Whether each chunk passed to the inference step should be a np.ndarray or not. Numpy can in some settings
            turn the term IDs into floats; these will be converted back into integers in inference, which incurs a
            performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`.

        """
        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaMulticore.update() called with an empty corpus")
            return

        self.state.numdocs += lencorpus

        if not self.batch:
            updatetype = "online"
            updateafter = self.chunksize * self.workers
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, "
            "updating every %i documents, evaluating every ~%i documents, "
            "iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, self.passes, lencorpus, updateafter,
            evalafter, self.iterations, self.gamma_threshold
        )

        if updates_per_pass * self.passes < 10:
            logger.warning(
                "too few updates, training might not converge; "
                "consider increasing the number of passes or iterations to improve accuracy"
            )

        job_queue = Queue(maxsize=2 * self.workers)
        result_queue = Queue()

        # rho is the "speed" of updating; TODO try other fncs
        # pass_ + num_updates handles increasing the starting t for each pass,
        # while allowing it to "reset" on the first pass of each update
        def rho():
            return pow(self.offset + pass_ + (self.num_updates / self.chunksize), -self.decay)

        logger.info("training LDA model using %i processes", self.workers)
        pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,))
        for pass_ in xrange(self.passes):
            queue_size, reallen = [0], 0
            other = LdaState(self.eta, self.state.sstats.shape)

            def process_result_queue(force=False):
                """
                Clear the result queue, merging all intermediate results, and update the
                LDA model if necessary.

                """
                merged_new = False
                while not result_queue.empty():
                    other.merge(result_queue.get())
                    queue_size[0] -= 1
                    merged_new = True
                if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)):
                    self.do_mstep(rho(), other, pass_ > 0)
                    other.reset()
                    if self.eval_every is not None and \
                            ((force and queue_size[0] == 0) or
                                 (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
                        self.log_perplexity(chunk, total_docs=lencorpus)

            chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy)
            for chunk_no, chunk in enumerate(chunk_stream):
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                # put the chunk into the workers' input job queue
                chunk_put = False
                while not chunk_put:
                    try:
                        job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1)
                        chunk_put = True
                        queue_size[0] += 1
                        logger.info(
                            "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, "
                            "outstanding queue size %i",
                            pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]
                        )
                    except queue.Full:
                        # in case the input job queue is full, keep clearing the
                        # result queue, to make sure we don't deadlock
                        process_result_queue()

                process_result_queue()
            # endfor single corpus pass

            # wait for all outstanding jobs to finish
            while queue_size[0] > 0:
                process_result_queue(force=True)

            if reallen != lencorpus:
                raise RuntimeError("input corpus size changed during training (don't use generators as input)")
        # endfor entire update

        pool.terminate()
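
Both multicore examples use the same step-size schedule for the M step: rho_t = (offset + t) ** (-decay), where t grows with the pass number and the number of processed chunks. The "guaranteed to converge for any decay in (0.5, 1.0]" remark comes from the Robbins-Monro conditions, which require the step sizes to sum to infinity while their squares sum to a finite value; a power schedule with 0.5 < decay <= 1 satisfies both. A short numeric illustration (the offset and decay values are arbitrary):

def rho(t, offset=1.0, decay=0.7):
    """Step size used when merging the workers' sufficient statistics at update t."""
    return (offset + t) ** (-decay)

print([round(rho(t), 4) for t in range(0, 50, 10)])
# the weights shrink slowly enough to keep learning, but fast enough for the noise to average out
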
Example #22
    def update(self,
               corpus,
               chunksize=None,
               decay=None,
               passes=None,
               update_every=None,
               eval_every=None,
               iterations=None,
               gamma_threshold=None):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (repeatable stream of documents).

        In distributed mode, the E step is distributed over a cluster of machines.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].

        """
        # use parameters given in constructor, unless user explicitly overrode them
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if eval_every is None:
            eval_every = self.eval_every
        if iterations is None:
            iterations = self.iterations
        if gamma_threshold is None:
            gamma_threshold = self.gamma_threshold

        # rho is the "speed" of updating; TODO try other fncs
        rho = lambda: pow(1.0 + self.num_updates, -decay)

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning(
                "input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaModel.update() called with an empty corpus")
            return

        self.state.numdocs += lencorpus

        if update_every:
            updatetype = "online"
            updateafter = min(lencorpus,
                              update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus,
                        (eval_every or 0) * self.numworkers * chunksize)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s LDA training, %s topics, %i passes over "
            "the supplied corpus of %i documents, updating model once "
            "every %i documents, evaluating perplexity every %i documents, "
            "iterating %ix with a convergence threshold of %f" %
            (updatetype, self.num_topics, passes, lencorpus, updateafter,
             evalafter, iterations, gamma_threshold))

        if updates_per_pass * passes < 10:
            logger.warning(
                "too few updates, training might not converge; consider "
                "increasing the number of passes or iterations to improve accuracy"
            )

        for pass_ in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers' % self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                other = LdaState(self.eta, self.state.sstats.shape)
            dirty = False

            reallen = 0
            for chunk_no, chunk in enumerate(
                    utils.grouper(corpus, chunksize, as_numpy=True)):
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                if eval_every and ((reallen == lencorpus) or
                                   ((chunk_no + 1) %
                                    (eval_every * self.numworkers) == 0)):
                    self.log_perplexity(chunk, total_docs=lencorpus)

                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info(
                        'PROGRESS: pass %i, dispatching documents up to #%i/%i'
                        %
                        (pass_, chunk_no * chunksize + len(chunk), lencorpus))
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info(
                        'PROGRESS: pass %i, at document #%i/%i' %
                        (pass_, chunk_no * chunksize + len(chunk), lencorpus))
                    gammat = self.do_estep(chunk, other)

                    if self.optimize_alpha:
                        self.update_alpha(gammat, rho)

                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every *
                                                      self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info(
                            "reached the end of input; now waiting for all remaining jobs to finish"
                        )
                        other = self.dispatcher.getstate()
                    self.do_mstep(rho(), other)
                    del other  # free up some mem

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = LdaState(self.eta, self.state.sstats.shape)
                    dirty = False
            #endfor single corpus iteration
            if reallen != lencorpus:
                raise RuntimeError(
                    "input corpus size changed during training (don't use generators as input)"
                )

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info(
                        "reached the end of input; now waiting for all remaining jobs to finish"
                    )
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other)
                del other
                dirty = False
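
The online/batch split in the example above boils down to how `updateafter` is computed: in online mode the model is refreshed after `update_every * numworkers * chunksize` documents, in batch mode only once per pass. A quick arithmetic sketch of how those knobs interact (all numbers are made up):

lencorpus, chunksize, numworkers, update_every, passes = 100000, 2000, 4, 1, 2

updateafter = min(lencorpus, update_every * numworkers * chunksize)  # 8000 documents per M step
updates_per_pass = max(1, lencorpus // updateafter)                  # 12 M steps per pass
print(updateafter, updates_per_pass, updates_per_pass * passes)      # 8000 12 24 -> above the "too few updates" threshold of 10
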
Example #23
    def update(self, corpus, chunksize=None, passes=None, eval_every=None):
        """Train the model with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
            Leave at default `passes=1` if your input is an iterator.
        eval_every: int, optional
            Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.

        """

        # use parameters given in constructor, unless user explicitly overrode them
        if passes is None:
            passes = self.passes
        if eval_every is None:
            eval_every = self.eval_every

        lencorpus = np.inf

        if isinstance(corpus, scipy.sparse.csc.csc_matrix):
            lencorpus = corpus.shape[1]
        else:
            try:
                lencorpus = len(corpus)
            except TypeError:
                logger.info("input corpus stream has no len()")

        if chunksize is None:
            chunksize = min(lencorpus, self.chunksize)

        evalafter = min(lencorpus, (eval_every or 0) * chunksize)

        if lencorpus == 0:
            logger.warning("Nmf.update() called with an empty corpus")
            return

        if isinstance(corpus, collections.abc.Iterator) and self.passes > 1:
            raise ValueError(
                "Corpus is an iterator, only `passes=1` is valid.")

        logger.info(
            "running NMF training, %s topics, %i passes over the supplied corpus of %s documents, evaluating l2 norm "
            "every %i documents",
            self.num_topics,
            passes,
            lencorpus,
            evalafter,
        )

        chunk_overall_idx = 1

        for pass_ in range(passes):
            if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                grouper = (
                    # Older scipy (0.19 etc) throw an error when slicing beyond the actual sparse array dimensions, so
                    # we clip manually with min() here.
                    corpus[:, col_idx:min(corpus.shape[1], col_idx +
                                          self.chunksize)]
                    for col_idx in range(0, corpus.shape[1], self.chunksize))
            else:
                grouper = utils.grouper(corpus, self.chunksize)

            for chunk_idx, chunk in enumerate(grouper):
                if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                    v = chunk[:, self.random_state.permutation(chunk.shape[1])]

                    chunk_len = v.shape[1]
                else:
                    self.random_state.shuffle(chunk)

                    v = matutils.corpus2csc(
                        chunk,
                        num_terms=self.num_tokens,
                    )

                    chunk_len = len(chunk)

                logger.info("PROGRESS: pass %i, at document #%i/%s", pass_,
                            chunk_idx * chunksize + chunk_len, lencorpus)

                if self._W is None:
                    # If `self._W` is not set (i.e. the first batch being handled), compute the initial matrix using the
                    # batch mean.

                    self._setup(v)

                self._h = self._solveproj(v,
                                          self._W,
                                          h=self._h,
                                          v_max=self.v_max)
                h = self._h

                if eval_every and (((chunk_idx + 1) * chunksize >= lencorpus)
                                   or (chunk_idx + 1) % eval_every == 0):
                    logger.info("L2 norm: {}".format(self.l2_norm(v)))
                    self.print_topics(5)

                self.A *= chunk_overall_idx - 1
                self.A += h.dot(h.T)
                self.A /= chunk_overall_idx

                self.B *= chunk_overall_idx - 1
                self.B += v.dot(h.T)
                self.B /= chunk_overall_idx

                previous_w_error = self._w_error

                self._solve_w()

                chunk_overall_idx += 1

                logger.info("W error diff: {}".format(
                    (self._w_error - previous_w_error)))
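
The `A` and `B` updates at the end of the NMF loop keep running averages of h.dot(h.T) and v.dot(h.T) over all chunks seen so far: after the k-th chunk, A_k = ((k - 1) * A_{k-1} + h h^T) / k, and likewise for B. A tiny numpy sketch of that bookkeeping, detached from the rest of the model:

import numpy as np

def running_average(prev_mean, new_term, k):
    """Mean of the first k terms, given the mean of the first k - 1."""
    return (prev_mean * (k - 1) + new_term) / k

rng = np.random.default_rng(0)
terms = [rng.standard_normal((3, 3)) for _ in range(5)]
mean = np.zeros((3, 3))
for k, term in enumerate(terms, start=1):
    mean = running_average(mean, term, k)
print(np.allclose(mean, np.mean(terms, axis=0)))  # True: the incremental update reproduces the plain mean
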
Example #24
    def train(self, sentences, total_words=None, word_count=0, chunksize=100, total_examples=None, queue_factor=2, report_delay=1):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For FastSent, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
        sentences are the same as those that were used to initially build the vocabulary.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension not loaded for FastSent, training will be slow. "
                          "Install a C compiler and reinstall gensim for fast training.")
            self.neg_labels = []

        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using sample=%s",
            self.workers, len(self.vocab), self.layer1_size,
            self.sample)

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0'):
            raise RuntimeError("you must first finalize vocabulary before training the model")

        if total_words is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info("expecting %i examples, matching count from corpus used for vocabulary survey", total_examples)
            else:
                raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations")

        if self.iter > 1:
            sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
            total_words = total_words and total_words * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_init():
            work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            return (work, neu1)

        def worker_one_job(job, inits):
            items, alpha = job
            if items is None:  # signal to finish
                return False
            # train & return tally
            tally, raw_tally = self._do_train_job(items, alpha, inits)
            progress_queue.put((len(items), tally, raw_tally))  # report progress
            return True

        # loop of a given worker: fetches the data from the queue and then
        # launches the worker_one_job function
        def worker_loop():
            """Train the model, lifting lists of sentences from the jobs queue."""
            init = worker_init()
            while True:
                job = job_queue.get()
                if not worker_one_job(job, init):
                    break

        start, next_report = default_timer(), 1.0

        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        if self.workers > 0:
            job_queue = Queue(maxsize=queue_factor * self.workers)
        else:
            job_queue = FakeJobQueue(worker_init, worker_one_job)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()
        pushed_words = 0
        pushed_examples = 0
        example_count = 0
        trained_word_count = 0
        raw_word_count = word_count
        push_done = False
        done_jobs = 0
        next_alpha = self.alpha
        jobs_source = enumerate(utils.grouper(sentences, chunksize))
        # fill jobs queue with (sentence, alpha) job tuples
        while True:
            try:
                job_no, items = next(jobs_source)
                logger.debug("putting job #%i in the queue at alpha %.05f", job_no, next_alpha)
                job_queue.put((items, next_alpha))
                # update the learning rate before every next job
                if self.min_alpha < next_alpha:
                    if total_examples:
                        # examples-based decay
                        pushed_examples += len(items)
                        next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_examples / total_examples)
                    else:
                        # words-based decay
                        pushed_words += self._raw_word_count(items)
                        next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_words / total_words)
                    next_alpha = max(next_alpha, self.min_alpha)
            except StopIteration:
                logger.info("reached end of input; waiting to finish %i outstanding jobs",
                    job_no - done_jobs + 1)
                for _ in xrange(self.workers):
                    job_queue.put((None, 0))  # give the workers heads up that they can finish -- no more work!
                push_done = True
            try:
                while done_jobs < (job_no+1) or not push_done:
                    examples, trained_words, raw_words = progress_queue.get(push_done)  # only block after all jobs pushed
                    example_count += examples
                    trained_word_count += trained_words  # only words in vocab & sampled
                    raw_word_count += raw_words
                    done_jobs += 1
                    elapsed = default_timer() - start
                    if elapsed >= next_report:
                        if total_examples:
                            # examples-based progress %
                            logger.info(
                                "FASTSENT MODEL PROGRESS: at %.2f%% examples, %.0f words/s",
                                100.0 * example_count / total_examples, trained_word_count / elapsed)
                        else:
                            # words-based progress %
                            logger.info(
                                "FASTSENT MODEL PROGRESS: at %.2f%% words, %.0f words/s",
                                100.0 * raw_word_count / total_words, trained_word_count / elapsed)
                        next_report = elapsed + report_delay  # don't flood log, wait report_delay seconds
                else:
                    # loop ended by job count; really done
                    break
            except Empty:
                pass  # already out of loop; continue to next push

        elapsed = default_timer() - start
        logger.info(
            "training on %i raw words took %.1fs, %.0f trained words/s",
            raw_word_count, elapsed, trained_word_count / elapsed if elapsed else 0.0)

        if total_examples and total_examples != example_count:
            logger.warning("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
        if total_words and total_words != raw_word_count:
            logger.warning("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        self.clear_sims()
        return trained_word_count
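
The job-feeding loop in example #24 decays the learning rate linearly from `alpha` towards `min_alpha`, in proportion to how much of the corpus has been pushed so far (measured in examples or in raw words). The schedule on its own is just:

def next_alpha(alpha, min_alpha, progress):
    """Linear decay; `progress` is the fraction of the corpus pushed so far, in [0, 1]."""
    return max(min_alpha, alpha - (alpha - min_alpha) * progress)

print([round(next_alpha(0.025, 0.005, p), 3) for p in (0.0, 0.25, 0.5, 1.0)])
# alpha shrinks linearly: 0.025, 0.02, 0.015, 0.005
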
Example #25
    def train(self, sentences, total_words=None, word_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn(
                "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`"
            )
        logger.info(
            "training model with %i workers on %i vocabulary and %i features" %
            (self.workers, len(self.vocab), self.layer1_size))

        if not self.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count, total_words = [
            word_count
        ], total_words or sum(v.count for v in itervalues(self.vocab))
        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        jobs = Queue(maxsize=2 * self.workers)
        # for shared state (=number of words trained so far, log reports...)
        lock = threading.Lock()

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(
                self.layer1_size,
                dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(
                    self.min_alpha,
                    self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                if self.sg:
                    job_words = sum(
                        train_sentence_sg(self, sentence, alpha, work)
                        for sentence in job)
                else:
                    job_words = sum(
                        train_sentence_cbow(self, sentence, alpha, work, neu1)
                        for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha,
                               word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [
            threading.Thread(target=worker_train) for _ in xrange(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
        no_oov = ([self.vocab.get(word, None) for word in sentence]
                  for sentence in sentences)
        for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" %
                         (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info(
            "reached the end of input; waiting to finish %i outstanding jobs" %
            jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
                    (word_count[0], elapsed,
                     word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
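
Example #25 also shows the simplest preprocessing step shared by these word2vec-style trainers: each sentence is mapped to its `Vocab` entries, with out-of-vocabulary words replaced by `None` so the training kernels can skip them. A stand-alone sketch of that lookup, using a plain dict in place of the model's vocabulary:

vocab = {"human": 0, "computer": 1, "interface": 2}  # stand-in for self.vocab

def to_vocab_entries(sentence, vocab):
    """Map tokens to vocabulary entries; unknown words become None."""
    return [vocab.get(word) for word in sentence]

print(to_vocab_entries(["human", "machine", "interface"], vocab))
# [0, None, 2] -- "machine" is out of vocabulary and will be ignored during training
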
Example #26
File: ldamodel.py Project: ChicoQ/gensim
    def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=None):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached).

        In distributed mode, the E step is distributed over a cluster of machines.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].
        """
        # use parameters given in constructor, unless user explicitly overrode them
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every

        # rho is the "speed" of updating; TODO try other fncs
        rho = lambda: pow(1.0 + self.num_updates, -decay)

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaModel.update() called with an empty corpus")
            return
        self.state.numdocs += lencorpus

        if update_every > 0:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info("running %s LDA training, %s topics, %i passes over "
                    "the supplied corpus of %i documents, updating model once "
                    "every %i documents" %
                    (updatetype, self.num_topics, passes, lencorpus, updateafter))
        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider "
                           "increasing the number of passes to improve accuracy")

        for iteration in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers' % self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                other = LdaState(self.eta, self.state.sstats.shape)
            dirty = False

            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)):
                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
                                (iteration, chunk_no * chunksize + len(chunk), lencorpus))
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                                (iteration, chunk_no * chunksize + len(chunk), lencorpus))
                    self.do_estep(chunk, other)
                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                        other = self.dispatcher.getstate()
                    self.do_mstep(rho(), other)
                    del other # free up some mem

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = LdaState(self.eta, self.state.sstats.shape)
                    dirty = False
            #endfor single corpus iteration

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other)
                del other
                dirty = False
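For context, a typical way to drive an update like this through gensim's public API is shown below. The imports and constructor arguments follow standard gensim usage and are an assumed illustration, not part of the snippet above.

# Assumed typical usage of LdaModel.update() through gensim's public API;
# standard gensim calls, shown here only as an illustration.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

texts = [["human", "interface", "computer"],
         ["survey", "user", "computer", "system"],
         ["graph", "minors", "trees"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=5)
# later: fold in more documents; old and new counts are merged in proportion
lda.update([dictionary.doc2bow(["graph", "survey", "trees"])])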
Example #27
0
    def update(self, corpus, chunksize=None, decay=None, offset=None,
               passes=None, update_every=None, eval_every=None, iterations=None,
               gamma_threshold=None):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (repeatable stream of documents).

        In distributed mode, the E step is distributed over a cluster of machines.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0]. Additionally, for smaller
        `corpus` sizes, an increasing `offset` may be beneficial (see
        Table 1 in Hoffman et al.)

        """
        # use parameters given in constructor, unless user explicitly overrode them
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay
        if offset is None:
            offset = self.offset
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if eval_every is None:
            eval_every = self.eval_every
        if iterations is None:
            iterations = self.iterations
        if gamma_threshold is None:
            gamma_threshold = self.gamma_threshold

        # rho is the "speed" of updating; TODO try other fncs
        rho = lambda: pow(offset + self.num_updates / self.chunksize, -decay)

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaModel.update() called with an empty corpus")
            return

        self.state.numdocs += lencorpus

        if update_every:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info("running %s LDA training, %s topics, %i passes over "
                    "the supplied corpus of %i documents, updating model once "
                    "every %i documents, evaluating perplexity every %i documents, "
                    "iterating %ix with a convergence threshold of %f" %
                    (updatetype, self.num_topics, passes, lencorpus,
                        updateafter, evalafter, iterations,
                        gamma_threshold))

        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider "
                           "increasing the number of passes or iterations to improve accuracy")

        for pass_ in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers' % self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                other = LdaState(self.eta, self.state.sstats.shape)
            dirty = False

            reallen = 0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)):
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                    self.log_perplexity(chunk, total_docs=lencorpus)

                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i' %
                                (pass_, chunk_no * chunksize + len(chunk), lencorpus))
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info('PROGRESS: pass %i, at document #%i/%i' %
                                (pass_, chunk_no * chunksize + len(chunk), lencorpus))
                    gammat = self.do_estep(chunk, other)

                    if self.optimize_alpha:
                        self.update_alpha(gammat, rho)

                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                        other = self.dispatcher.getstate()
                    self.do_mstep(rho(), other)
                    del other # free up some mem

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = LdaState(self.eta, self.state.sstats.shape)
                    dirty = False
            #endfor single corpus iteration
            if reallen != lencorpus:
                raise RuntimeError("input corpus size changed during training (don't use generators as input)")

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other)
                del other
                dirty = False
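The update weight `rho` above decays as `(offset + num_updates / chunksize) ** (-decay)`. The tiny standalone sketch below (a hypothetical helper, not gensim code) just makes that schedule visible for a few document counts.

# Standalone sketch of the update weight ("rho") used above:
# rho = (offset + num_updates / chunksize) ** (-decay). Illustration only;
# parameter values are made up.
def rho(num_updates, chunksize, offset=1.0, decay=0.7):
    return pow(offset + num_updates / chunksize, -decay)

for docs_seen in (0, 2000, 20000, 200000):
    print(docs_seen, round(rho(docs_seen, chunksize=2000), 4))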
Example #28
0
    def update(self, corpus, chunksize=None, passes=None, eval_every=None):
        """Train the model with new documents.

        Parameters
        ----------
        corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents)
            Training corpus.
            Can be either iterable of documents, which are lists of `(word_id, word_count)`,
            or a sparse csc matrix of BOWs for each document.
            If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`).
        chunksize: int, optional
            Number of documents to be used in each training chunk.
        passes: int, optional
            Number of full passes over the training corpus.
            Leave at default `passes=1` if your input is an iterator.
        eval_every: int, optional
            Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low.

        """

        # use parameters given in constructor, unless user explicitly overrode them
        if passes is None:
            passes = self.passes
        if eval_every is None:
            eval_every = self.eval_every

        lencorpus = np.inf

        if isinstance(corpus, scipy.sparse.csc.csc_matrix):
            lencorpus = corpus.shape[1]
        else:
            try:
                lencorpus = len(corpus)
            except TypeError:
                logger.info("input corpus stream has no len()")

        if chunksize is None:
            chunksize = min(lencorpus, self.chunksize)

        evalafter = min(lencorpus, (eval_every or 0) * chunksize)

        if lencorpus == 0:
            logger.warning("Nmf.update() called with an empty corpus")
            return

        if isinstance(corpus, collections.abc.Iterator) and self.passes > 1:
            raise ValueError("Corpus is an iterator, only `passes=1` is valid.")

        logger.info(
            "running NMF training, %s topics, %i passes over the supplied corpus of %s documents, evaluating l2 norm "
            "every %i documents",
            self.num_topics, passes, lencorpus, evalafter,
        )

        chunk_overall_idx = 1

        for pass_ in range(passes):
            if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                grouper = (
                    # Older scipy (0.19 etc) throw an error when slicing beyond the actual sparse array dimensions, so
                    # we clip manually with min() here.

                    corpus[:, col_idx:min(corpus.shape[1], col_idx + self.chunksize)]
                    for col_idx
                    in range(0, corpus.shape[1], self.chunksize)
                )
            else:
                grouper = utils.grouper(corpus, self.chunksize)

            for chunk_idx, chunk in enumerate(grouper):
                if isinstance(corpus, scipy.sparse.csc.csc_matrix):
                    v = chunk[:, self.random_state.permutation(chunk.shape[1])]

                    chunk_len = v.shape[1]
                else:
                    self.random_state.shuffle(chunk)

                    v = matutils.corpus2csc(
                        chunk,
                        num_terms=self.num_tokens,
                    )

                    chunk_len = len(chunk)

                logger.info(
                    "PROGRESS: pass %i, at document #%i/%s",
                    pass_, chunk_idx * chunksize + chunk_len, lencorpus
                )

                if self._W is None:
                    # If `self._W` is not set (i.e. the first batch being handled), compute the initial matrix using the
                    # batch mean.

                    self._setup(v)

                self._h = self._solveproj(v, self._W, h=self._h, v_max=self.v_max)
                h = self._h

                if eval_every and (((chunk_idx + 1) * chunksize >= lencorpus) or (chunk_idx + 1) % eval_every == 0):
                    logger.info("L2 norm: {}".format(self.l2_norm(v)))
                    self.print_topics(5)

                self.A *= chunk_overall_idx - 1
                self.A += h.dot(h.T)
                self.A /= chunk_overall_idx

                self.B *= chunk_overall_idx - 1
                self.B += v.dot(h.T)
                self.B /= chunk_overall_idx

                previous_w_error = self._w_error

                self._solve_w()

                chunk_overall_idx += 1

                logger.info("W error diff: {}".format((self._w_error - previous_w_error)))
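The `A` and `B` updates above maintain running means of `h.dot(h.T)` and `v.dot(h.T)` across chunks. A minimal numpy sketch of that incremental-mean trick, with made-up shapes and random data, might look like this:

# Minimal numpy sketch of the incremental means maintained above:
# after n chunks, A holds the mean of h @ h.T and B the mean of v @ h.T.
# Shapes and data are invented for illustration.
import numpy as np

rng = np.random.default_rng(0)
num_topics, num_terms = 5, 20
A = np.zeros((num_topics, num_topics))
B = np.zeros((num_terms, num_topics))

for n in range(1, 11):                # n plays the role of chunk_overall_idx
    h = rng.random((num_topics, 8))   # topic weights for one chunk
    v = rng.random((num_terms, 8))    # term counts for the same chunk
    A = (A * (n - 1) + h @ h.T) / n
    B = (B * (n - 1) + v @ h.T) / n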
Example #29
0
    def add_documents(self, corpus, chunksize=None, decay=None):
        """
        Update singular value decomposition to take into account a new
        corpus of documents.

        Training proceeds in chunks of `chunksize` documents at a time. The size of
        `chunksize` is a tradeoff between increased speed (bigger `chunksize`)
        vs. lower memory footprint (smaller `chunksize`). If the distributed mode
        is on, each chunk is sent to a different worker/computer.

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the
        input document stream, by giving less emphasis to old observations. This allows
        LSA to gradually "forget" old observations (documents) and give more
        preference to new ones.
        """
        logger.info("updating model with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.num_terms, self.num_topics, None)
                update.u, update.s = stochastic_svd(corpus, self.num_topics,
                    num_terms=self.num_terms, chunksize=chunksize,
                    extra_dims=self.extra_samples, power_iters=self.power_iters)
                self.projection.merge(update, decay=decay)
            else:
                # the one-pass algo
                doc_no = 0
                if self.dispatcher:
                    logger.info('initializing %s workers' % self.numworkers)
                    self.dispatcher.reset()
                for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                    logger.info("preparing a new chunk of documents")
                    nnz = sum(len(doc) for doc in chunk)
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    logger.debug("converting corpus to csc format")
                    job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz)
                    del chunk
                    doc_no += job.shape[1]
                    if self.dispatcher:
                        # distributed version: add this job to the job queue, so workers can work on it
                        logger.debug("creating job #%i" % chunk_no)
                        self.dispatcher.putjob(job) # put job into queue; this will eventually block, because the queue has a small finite size
                        del job
                        logger.info("dispatched documents up to #%s" % doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, power_iters=self.power_iters)
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s" % doc_no)
                        self.print_topics(5)

                # wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    self.projection = self.dispatcher.getstate()
#            logger.info("top topics after adding %i documents" % doc_no)
#            self.print_debug(10)
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert self.onepass, "distributed two-pass algo not supported yet"
            update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters)
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
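A hedged usage example for an incremental LSI update of this kind, using standard gensim API calls (treat the exact arguments as an illustration, not a specification of the snippet above):

# Assumed call sequence for an incremental LSI update; standard gensim API,
# shown only to illustrate how add_documents() is typically driven.
from gensim.corpora import Dictionary
from gensim.models import LsiModel

texts = [["human", "interface", "computer"],
         ["graph", "minors", "trees"],
         ["graph", "survey", "system"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lsi = LsiModel(corpus=corpus, id2word=dictionary, num_topics=2)
# stream in a later batch; decay < 1.0 would down-weight the older batch
lsi.add_documents([dictionary.doc2bow(["user", "interface", "system"])], decay=1.0)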
Example #30
0
    def train(self, instances, total_feats=None, feat_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of instances.
        Each instance must be a list of unicode strings or ints (indices).

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`")
        logger.info("training model with %i workers on %i vocabulary and %i embedding size"
            ", and 'negative sampling'=%s" %
            (self.workers, len(self.vocab), self.layer1_size, self.negative))

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        feat_count = [feat_count]
        total_feats = total_feats or int(sum(v.count for v in itervalues(self.vocab)))
        jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of instances from the jobs queue."""
            # multiple working spaces (one set per thread)
            work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * feat_count[0] / total_feats))
                # how many words did we train on? out-of-vocabulary (unknown) features do not count
                job_words = sum(train_instance(self, instance, alpha, work) for instance in job)
                with lock:
                    feat_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% features, alpha %.05f, %.0f features/s" %
                            (100.0 * feat_count[0] / total_feats, alpha, feat_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        def prepare_instances():
            for instance in instances:
                sampled = [self.vocab[feat] for feat in instance
                    if feat in self.vocab]
                yield sampled

        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(utils.grouper(prepare_instances(), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i features took %.1fs, %.0f features/s" %
            (feat_count[0], elapsed, feat_count[0] / elapsed if elapsed else 0.0))

        return feat_count[0]
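The workers above throttle their progress logging to roughly one report per second. A generic sketch of that throttle, independent of any gensim internals, might look like this:

# Generic sketch of the "report at most once per second" throttle used by
# the worker above; purely illustrative.
import time

def process_with_progress(items):
    start, next_report, done = time.time(), 1.0, 0
    for _ in items:
        done += 1  # stand-in for real work
        elapsed = time.time() - start
        if elapsed >= next_report:
            rate = done / elapsed if elapsed else 0.0
            print("PROGRESS: %i items, %.0f items/s" % (done, rate))
            next_report = elapsed + 1.0  # wait at least a second between reports
    return done

process_with_progress(range(2_000_000))  # prints nothing if it finishes within a second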
Example #31
0
    def update(self, corpus):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (repeatable stream of documents).

        The E-step is distributed into the several processes.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].

        """
        # rho is the "speed" of updating, decelerating over time
        rho = lambda: pow(1.0 + self.num_updates / self.chunksize, -self.decay)

        try:
            lencorpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaMulticore.update() called with an empty corpus")
            return

        self.state.numdocs += lencorpus

        if not self.batch:
            updatetype = "online"
            updateafter = self.chunksize * self.workers
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info("running %s LDA training, %s topics, %i passes over the"
            " supplied corpus of %i documents, updating every %i documents,"
            " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter,
            self.iterations, self.gamma_threshold)

        if updates_per_pass * self.passes < 10:
            logger.warning("too few updates, training might not converge; consider "
                "increasing the number of passes or iterations to improve accuracy")

        def worker_e_step(input_queue, result_queue):
            """
            Perform E-step for each (chunk_no, chunk, model) 3-tuple from the
            input queue, placing the resulting state into the result queue.

            """
            logger.debug("worker process entering E-step loop")
            while True:
                logger.debug("getting a new job")
                chunk_no, chunk, worker_lda = input_queue.get()
                logger.debug("processing chunk #%i of %i documents", chunk_no, len(chunk))
                worker_lda.state.reset()
                worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
                del chunk
                logger.debug("processed chunk, queuing the result")
                result_queue.put(worker_lda.state)
                del worker_lda  # free up some memory
                logger.debug("result put")

        job_queue = Queue(maxsize=2 * self.workers)
        result_queue = Queue()

        logger.info("training LDA model using %i processes", self.workers)
        pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,))
        for pass_ in xrange(self.passes):
            queue_size, reallen = [0], 0
            other = LdaState(self.eta, self.state.sstats.shape)

            def process_result_queue(force=False):
                """
                Clear the result queue, merging all intermediate results, and update the
                LDA model if necessary.

                """
                merged_new = False
                while not result_queue.empty():
                    other.merge(result_queue.get())
                    queue_size[0] -= 1
                    merged_new = True
                if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)):
                    self.do_mstep(rho(), other)
                    other.reset()
                    if self.eval_every is not None and ((force and queue_size[0] == 0) or (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
                        self.log_perplexity(chunk, total_docs=lencorpus)

            chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=True)
            for chunk_no, chunk in enumerate(chunk_stream):
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                # put the chunk into the workers' input job queue
                chunk_put = False
                while not chunk_put:
                    try:
                        job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1)
                        chunk_put = True
                        queue_size[0] += 1
                        logger.info('PROGRESS: pass %i, dispatched chunk #%i = '
                            'documents up to #%i/%i, outstanding queue size %i',
                            pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0])
                    except Full:
                        # in case the input job queue is full, keep clearing the
                        # result queue, to make sure we don't deadlock
                        process_result_queue()

                process_result_queue()
            #endfor single corpus pass

            # wait for all outstanding jobs to finish
            while queue_size[0] > 0:
                process_result_queue(force=True)

            if reallen != lencorpus:
                raise RuntimeError("input corpus size changed during training (don't use generators as input)")
        #endfor entire update

        pool.close()
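The dispatch loop above alternates a non-blocking put into a bounded job queue with draining the result queue, so producers and workers cannot deadlock. The sketch below reproduces only that queue discipline, with threads instead of processes; it is an illustration, not the original multiprocessing setup.

# Threading-based sketch of the "put without blocking, drain results on Full"
# pattern from the dispatch loop above. The real code uses multiprocessing;
# this keeps only the queue discipline.
import threading
from queue import Queue, Full, Empty

job_queue, result_queue = Queue(maxsize=2), Queue()

def worker():
    while True:
        job = job_queue.get()
        if job is None:
            break
        result_queue.put(sum(job))  # stand-in for the E-step

threads = [threading.Thread(target=worker, daemon=True) for _ in range(2)]
for t in threads:
    t.start()

results = []

def drain():
    while True:
        try:
            results.append(result_queue.get_nowait())
        except Empty:
            return

for chunk in ([1, 2], [3, 4], [5, 6], [7, 8]):
    while True:
        try:
            job_queue.put(chunk, block=False)
            break
        except Full:
            drain()  # keep consuming results so workers can make progress
for _ in threads:
    job_queue.put(None)
for t in threads:
    t.join()
drain()
print(results)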
Example #32
0
    def add_documents(self, corpus, chunksize=None, decay=None):
        """Update model with new `corpus`.

        Parameters
        ----------
        corpus : {iterable of list of (int, float), scipy.sparse.csc}
            Stream of document vectors or sparse matrix of shape (`num_terms`, num_documents).
        chunksize : int, optional
            Number of documents to be used in each training chunk; will use `self.chunksize` if not specified.
        decay : float, optional
            Weight of existing observations relative to new ones; will use `self.decay` if not specified.

        Notes
        -----
        Training proceeds in chunks of `chunksize` documents at a time. The size of `chunksize` is a tradeoff
        between increased speed (bigger `chunksize`) vs. lower memory footprint (smaller `chunksize`).
        If the distributed mode is on, each chunk is sent to a different worker/computer.

        """
        logger.info("updating model with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.num_terms, self.num_topics, None, dtype=self.dtype)
                update.u, update.s = stochastic_svd(
                    corpus, self.num_topics,
                    num_terms=self.num_terms, chunksize=chunksize,
                    extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype
                )
                self.projection.merge(update, decay=decay)
                self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0
            else:
                # the one-pass algo
                doc_no = 0
                if self.dispatcher:
                    logger.info('initializing %s workers', self.numworkers)
                    self.dispatcher.reset()
                for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                    logger.info("preparing a new chunk of documents")
                    nnz = sum(len(doc) for doc in chunk)
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    logger.debug("converting corpus to csc format")
                    job = matutils.corpus2csc(
                        chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz, dtype=self.dtype)
                    del chunk
                    doc_no += job.shape[1]
                    if self.dispatcher:
                        # distributed version: add this job to the job queue, so workers can work on it
                        logger.debug("creating job #%i", chunk_no)
                        # put job into queue; this will eventually block, because the queue has a small finite size
                        self.dispatcher.putjob(job)
                        del job
                        logger.info("dispatched documents up to #%s", doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(
                            self.num_terms, self.num_topics, job, extra_dims=self.extra_samples,
                            power_iters=self.power_iters, dtype=self.dtype
                        )
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s", doc_no)
                        self.print_topics(5)

                # wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    self.projection = self.dispatcher.getstate()
                self.docs_processed += doc_no
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            update = Projection(
                self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples,
                power_iters=self.power_iters, dtype=self.dtype
            )
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents", corpus.shape[1])
            self.docs_processed += corpus.shape[1]
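Several of these snippets funnel a bag-of-words chunk through `matutils.corpus2csc` before doing any numeric work. A minimal assumed usage example of that conversion:

# Minimal illustration of the corpus2csc conversion used throughout these
# snippets: bag-of-words documents become columns of a scipy CSC matrix.
# Standard gensim API, shown here only as an assumed usage example.
from gensim import matutils

bow_corpus = [
    [(0, 1.0), (2, 3.0)],            # doc 0
    [(1, 2.0)],                      # doc 1
    [(0, 1.0), (1, 1.0), (2, 1.0)],  # doc 2
]
csc = matutils.corpus2csc(bow_corpus, num_terms=3)
print(csc.shape)     # (num_terms, num_documents) == (3, 3)
print(csc.toarray())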
Example #33
0
parser.add_argument('-next',
                    '--next',
                    type=str,
                    default='next_bookcorpus.shlf')
parser.add_argument('-conj',
                    '--conj',
                    type=str,
                    default='conj_bookcorpus.shlf')
# parser.add_argument('-log_file', '--log_file', type=str, default='')

args = parser.parse_args()
sentences = MySentences(args.corpus)
log_file = pjoin('process_' + get_time_str() + '.log')
logging.basicConfig(filename=log_file, level=logging.DEBUG)
chunksize = 100
groups = enumerate(utils.grouper(sentences, chunksize))
n_sentence = 0
order_ = []
next_ = []
conj_ = []

while True:
    try:
        sentence_no, items = next(groups)
        o, n, c = make_all_tasks(items)
        order_ += o
        next_ += n
        conj_ += c
        logging.info("%s \t  %d %d %d %d", get_time_str(),
                     sentence_no * chunksize, len(order_), len(next_),
                     len(conj_))
    except StopIteration:
        # assumed completion: the example is cut off here; stop once the
        # grouped sentence stream is exhausted
        break
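The script above leans on `utils.grouper`, which slices any (possibly one-shot) iterable into fixed-size lists. A short assumed usage example of the standard API:

# Short illustration of gensim's utils.grouper: it yields lists of at most
# `chunksize` items from any iterable. Assumed usage example only.
from gensim import utils

stream = (i for i in range(10))            # any one-shot iterator works
for chunk_no, chunk in enumerate(utils.grouper(stream, 4)):
    print(chunk_no, chunk)                 # -> [0, 1, 2, 3], [4, 5, 6, 7], [8, 9]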
Example #34
0
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                  power_iters=0, dtype=numpy.float64, eps=1e-6):
    """
    Run truncated Singular Value Decomposition (SVD) on a sparse input.

    Return (U, S): the left singular vectors and the singular values of the input
    data stream `corpus` [4]_. The corpus may be larger than RAM (iterator of vectors).

    This may return less than the requested number of top `rank` factors, in case
    the input itself is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.

    This algorithm uses `2+power_iters` passes over the input data. In case you can only
    afford a single pass, set `onepass=True` in :class:`LsiModel` and avoid using
    this function directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [4] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices,
                                corpus.data, o.ravel(), y.ravel()) # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y) # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q) # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix
            sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o
                                    chunk.data, o.ravel(), y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y) # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q) # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = scipy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype)
            b = qt * chunk # dense * sparse matrix multiply
            del chunk
            x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" % str(x.shape))
        u, s, vt = scipy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        s = numpy.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
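The two-phase algorithm above (random projection, orthonormalization, optional power iterations, then a small dense SVD) can be sketched with dense numpy arrays in a few lines. This is an illustration of the Halko et al. scheme only, not the streamed, chunked implementation above:

# Dense-numpy sketch of the randomized SVD scheme implemented above
# (Halko, Martinsson, Tropp 2009): project onto a random Gaussian basis,
# orthonormalize, optionally power-iterate, then SVD the small projection.
# Illustration only; the function above streams the corpus in chunks instead.
import numpy as np

def randomized_svd(A, rank, oversample=10, power_iters=2, seed=0):
    rng = np.random.default_rng(seed)
    m, n = A.shape
    samples = rank + oversample
    Y = A @ rng.standard_normal((n, samples))   # action of A on a random basis
    Q, _ = np.linalg.qr(Y)                      # orthonormal range approximation
    for _ in range(power_iters):                # sharpen the spectrum
        Q, _ = np.linalg.qr(A @ (A.T @ Q))
    B = Q.T @ A                                 # small (samples x n) matrix
    Ub, s, Vt = np.linalg.svd(B, full_matrices=False)
    return (Q @ Ub)[:, :rank], s[:rank], Vt[:rank]

A = np.random.default_rng(1).standard_normal((200, 50))
U, s, Vt = randomized_svd(A, rank=10)
print(np.round(s, 3))                                        # approximate top singular values
print(np.round(np.linalg.svd(A, compute_uv=False)[:10], 3))  # exact ones, for comparison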
Example #35
0
    def add_documents(self, corpus, chunksize=None, decay=None):
        """
        Update singular value decomposition to take into account a new
        corpus of documents.

        Training proceeds in chunks of `chunksize` documents at a time. The size of
        `chunksize` is a tradeoff between increased speed (bigger `chunksize`)
        vs. lower memory footprint (smaller `chunksize`). If the distributed mode
        is on, each chunk is sent to a different worker/computer.

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the
        input document stream, by giving less emphasis to old observations. This allows
        LSA to gradually "forget" old observations (documents) and give more
        preference to new ones.
        """
        logger.info("updating model with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.num_terms, self.num_topics, None)
                update.u, update.s = stochastic_svd(
                    corpus,
                    self.num_topics,
                    num_terms=self.num_terms,
                    chunksize=chunksize,
                    extra_dims=self.extra_samples,
                    power_iters=self.power_iters)
                self.projection.merge(update, decay=decay)
            else:
                # the one-pass algo
                doc_no = 0
                for chunk_no, chunk in enumerate(
                        utils.grouper(corpus, chunksize)):
                    logger.info("preparing a new chunk of documents")
                    nnz = sum(len(doc) for doc in chunk)
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    logger.debug("converting corpus to csc format")
                    job = matutils.corpus2csc(chunk,
                                              num_docs=len(chunk),
                                              num_terms=self.num_terms,
                                              num_nnz=nnz)
                    del chunk
                    doc_no += job.shape[1]
                    if self.dispatcher:
                        # distributed version: add this job to the job queue, so workers can work on it
                        logger.debug("creating job #%i" % chunk_no)
                        # put job into queue; this will eventually block, because the queue has a small finite size
                        self.dispatcher.putjob(job)
                        del job
                        logger.info("dispatched documents up to #%s" % doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(self.num_terms,
                                            self.num_topics,
                                            job,
                                            extra_dims=self.extra_samples,
                                            power_iters=self.power_iters)
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s" % doc_no)
                        self.print_topics(5)

                # wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info(
                        "reached the end of input; now waiting for all remaining jobs to finish"
                    )
                    self.projection = self.dispatcher.getstate()
#            logger.info("top topics after adding %i documents" % doc_no)
#            self.print_debug(10)
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert self.onepass, "distributed two-pass algo not supported yet"
            update = Projection(self.num_terms,
                                self.num_topics,
                                corpus.tocsc(),
                                extra_dims=self.extra_samples,
                                power_iters=self.power_iters)
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents" %
                        (corpus.shape[1]))
Example #36
0
	def train(self, sentences, total_words=None, word_count=0, paragraphs_only = False, vocab = None, paragraphs = None):
		"""
		Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
		Each sentence must be a list of utf8 strings.

		"""
		if paragraphs is None:
			paragraphs = self.synparagraph
		if vocab is None:
			vocab = self.paragraph_vocab

		if not self.vocab:
			raise RuntimeError("you must first build vocabulary before training the model")

		start, next_report = time.time(), [1.0]
		word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab))
		jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
		lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)
		total_error = [0.0]

		def worker_train():
			"""Train the model, lifting lists of sentences from the jobs queue."""
			paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
			error = zeros(1, dtype = REAL)
			if self.concatenate:
				# word work here is for each individual word, so it has length logistic regression - para size
				word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype = REAL)
				neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
			else:
				# here word work is aggregated:
				word_work = zeros(self.layer1_size, dtype = REAL)
				neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

			zeros(self.logistic_regression_size, dtype = REAL)
			while True:
				job = jobs.get()
				if job is None:  # data finished, exit
					break
				# update the learning rate before every job
				alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
				# how many words did we train on? out-of-vocabulary (unknown) words do not count
				job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job))

				with lock:
					# here we can store the scores for later plotting and viewing...
					word_count[0] += job_words

					elapsed = time.time() - start
					total_error[0] += error[0]
					if elapsed >= next_report[0]:
						logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," %
							(100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
						next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

		workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
		for thread in workers:
			thread.daemon = True  # make interrupting the process with ctrl+c easier
			thread.start()

		# convert input strings to Vocab objects, and paragraph to paragraph (Vocab) object:
		no_oov = (self.create_job(sentence,vocab) for sentence in sentences)
		for job_no, job in enumerate(utils.grouper(no_oov, self.batchsize)):
			logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
			jobs.put(job)
		logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
		for _ in xrange(self.workers):
			jobs.put(None)  # give the workers heads up that they can finish -- no more work!

		for thread in workers:
			thread.join()

		elapsed = time.time() - start
		logger.info("training on %i sentences took %.1fs, %.0f sentences/s, %.6f" %
			(word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0, total_error[0]))

		return (word_count[0], total_error[0])
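The worker above anneals the learning rate linearly with the fraction of words already processed, clamped at `min_alpha`. A tiny standalone sketch of that schedule (hypothetical helper, illustrative numbers):

# Standalone sketch of the linear learning-rate decay used in the worker above:
# alpha shrinks with the fraction of words already trained on, clamped at
# min_alpha. Illustration only.
def decayed_alpha(alpha, min_alpha, words_done, total_words):
    return max(min_alpha, alpha * (1 - 1.0 * words_done / total_words))

for done in (0, 25000, 50000, 100000):
    print(done, round(decayed_alpha(0.025, 0.0001, done, 100000), 5))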
Example #37
0
def stochastic_svd(corpus,
                   rank,
                   num_terms,
                   chunksize=20000,
                   extra_dims=None,
                   power_iters=0,
                   dtype=numpy.float64,
                   eps=1e-6):
    """
    Return (U, S): the left singular vectors and the singular values of the streamed
    input corpus `corpus` [3]_.

    This may actually return less than the requested number of top `rank` factors,
    in case the input is of lower rank. The `extra_dims` (oversampling) and especially
    `power_iters` (power iterations) parameters affect accuracy of the decomposition.

    This algorithm uses `2+power_iters` passes over the data. In case you can only
    afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel`
    and avoid using this algorithm directly.

    The decomposition algorithm is based on
    **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.**

    .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole
       corpus fits into core memory and a different (more efficient) code path is chosen.
    """
    rank = int(rank)
    if extra_dims is None:
        samples = max(10, 2 * rank)  # use more samples than requested factors, to improve accuracy
    else:
        samples = rank + int(extra_dims)
    logger.info("using %i extra samples and %i power iterations" %
                (samples - rank, power_iters))

    num_terms = int(num_terms)

    # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O)
    # build Y in blocks of `chunksize` documents (much faster than going one-by-one
    # and more memory friendly than processing all documents at once)
    y = numpy.zeros(dtype=dtype, shape=(num_terms, samples))
    logger.info("1st phase: constructing %s action matrix" % str(y.shape))

    if scipy.sparse.issparse(corpus):
        m, n = corpus.shape
        assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (
            m, num_terms)
        o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
            y.dtype)  # draw a random gaussian matrix
        sparsetools.csc_matvecs(m, n, samples, corpus.indptr,
                                corpus.indices, corpus.data, o.ravel(),
                                y.ravel())  # y = corpus * o
        del o

        # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype!
        # so check for equal dtype explicitly, to avoid the extra memory footprint if possible
        if y.dtype != dtype:
            y = y.astype(dtype)

        logger.info("orthonormalizing %s action matrix" % str(y.shape))
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        logger.debug("running %i power iterations" % power_iters)
        for power_iter in xrange(power_iters):
            q = corpus.T * q
            q = [corpus * q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range after each power iteration step
    else:
        num_docs = 0
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize))
            # construct the chunk as a sparse matrix, to minimize memory overhead
            # definitely avoid materializing it as a dense (num_terms x chunksize) matrix!
            s = sum(len(doc) for doc in chunk)
            chunk = matutils.corpus2csc(
                chunk, num_terms=num_terms,
                dtype=dtype)  # documents = columns of sparse CSC
            m, n = chunk.shape
            assert m == num_terms
            assert n <= chunksize  # the very last chunk of A is allowed to be smaller in size
            num_docs += n
            logger.debug("multiplying chunk * gauss")
            o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(
                dtype)  # draw a random gaussian matrix
            sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices,  # y = y + chunk * o
                                    chunk.data, o.ravel(), y.ravel())
            del chunk, o
        y = [y]
        q, _ = matutils.qr_destroy(y)  # orthonormalize the range

        for power_iter in xrange(power_iters):
            logger.info("running power iteration #%i" % (power_iter + 1))
            yold = q.copy()
            q[:] = 0.0
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info('PROGRESS: at document #%i/%i' %
                            (chunk_no * chunksize, num_docs))
                chunk = matutils.corpus2csc(
                    chunk, num_terms=num_terms,
                    dtype=dtype)  # documents = columns of sparse CSC
                tmp = chunk.T * yold
                tmp = chunk * tmp
                del chunk
                q += tmp
            del yold
            q = [q]
            q, _ = matutils.qr_destroy(q)  # orthonormalize the range

    qt = q[:, :samples].T.copy()
    del q

    if scipy.sparse.issparse(corpus):
        b = qt * corpus
        logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape))
        u, s, vt = numpy.linalg.svd(b, full_matrices=False)
        del b, vt
    else:
        # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A
        # again, construct X incrementally, in chunks of `chunksize` documents from the streaming
        # input corpus A, to avoid using O(number of documents) memory
        x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64)
        logger.info("2nd phase: constructing %s covariance matrix" %
                    str(x.shape))
        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
            logger.info('PROGRESS: at document #%i/%i' %
                        (chunk_no * chunksize, num_docs))
            chunk = matutils.corpus2csc(chunk,
                                        num_terms=num_terms,
                                        dtype=qt.dtype)
            b = qt * chunk  # dense * sparse matrix multiply
            del chunk
            # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :(
            x += numpy.dot(b, b.T)
            del b

        # now we're ready to compute decomposition of the small matrix X
        logger.info("running dense decomposition on %s covariance matrix" %
                    str(x.shape))
        # could use linalg.eigh, but who cares... and svd returns the factors already sorted :)
        u, s, vt = numpy.linalg.svd(x)
        # sqrt to go back from singular values of X to singular values of B = singular values of the corpus
        s = numpy.sqrt(s)
    q = qt.T.copy()
    del qt

    logger.info("computing the final decomposition")
    keep = clip_spectrum(s**2, rank, discard=eps)
    u = u[:, :keep].copy()
    s = s[:keep]
    u = numpy.dot(q, u)
    return u.astype(dtype), s.astype(dtype)
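The routine above is a streamed variant of randomized SVD: multiply the corpus by a Gaussian test matrix, orthonormalize the result to get a basis for its range, optionally sharpen that basis with power iterations, then run a dense SVD on the small projected matrix. A minimal in-memory sketch of the same two-phase idea, using plain NumPy and hypothetical names (no chunking, no sparse handling):

import numpy

def randomized_svd_sketch(a, rank, oversample=10, power_iters=2, seed=0):
    """Toy dense analogue of the streamed routine above."""
    rng = numpy.random.RandomState(seed)
    samples = rank + oversample

    # phase 1: find an orthonormal basis q for the range of `a`
    y = a.dot(rng.normal(0.0, 1.0, (a.shape[1], samples)))  # y = a * gaussian
    q, _ = numpy.linalg.qr(y)
    for _ in range(power_iters):  # optional accuracy boost for slowly decaying spectra
        q, _ = numpy.linalg.qr(a.dot(a.T.dot(q)))

    # phase 2: dense svd of the small projected matrix b = q.T * a
    b = q.T.dot(a)
    u_b, s, _ = numpy.linalg.svd(b, full_matrices=False)
    u = q.dot(u_b)  # lift the left singular vectors back to the original space
    return u[:, :rank], s[:rank]

The streamed version above does the same thing without holding the corpus in memory: phase 1 accumulates `y` chunk by chunk, and phase 2 accumulates the covariance `X = B * B.T` instead of materializing `B`.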
示例#38
0
    def add_documents(self, corpus, chunksize=None, decay=None):
        """
        Update singular value decomposition to take into account a new
        corpus of documents.

        Training proceeds in chunks of `chunksize` documents at a time. The size of
        `chunksize` is a tradeoff between increased speed (bigger `chunksize`)
        and a lower memory footprint (smaller `chunksize`). If the distributed mode
        is on, each chunk is sent to a different worker/computer.

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the
        input document stream, by giving less emphasis to old observations. This allows
        LSA to gradually "forget" old observations (documents) and give more
        preference to new ones.
        """
        logger.info("updating model with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunksize is None:
            chunksize = self.chunksize
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.num_terms, self.num_topics, None)
                update.u, update.s = stochastic_svd(corpus, self.num_topics,
                    num_terms=self.num_terms, chunksize=chunksize,
                    extra_dims=self.extra_samples, power_iters=self.power_iters)
                self.projection.merge(update, decay=decay)
            else:
                # the one-pass algo
                doc_no = 0

                ##### counters for jobs
                count_sent = 0   
                count_recv = 0
                
                for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                    logger.info("preparing a new chunk of documents")
                    nnz = sum(len(doc) for doc in chunk)
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    logger.debug("converting corpus to csc format")
                    job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz)
                    del chunk
                    doc_no += job.shape[1]
                    
                    ##### distributed version
                    if self.dispatcher:
                        
                        ##### store the comm size and prepare status
                        num_workers = self.comm.Get_size() - 1
                        status = MPI.Status()

                        ##### time to send some jobs
                        logger.debug("creating job #%i" % chunk_no)
                        count_sent += 1

                        ##### send the initial batch
                        if (chunk_no < num_workers):
                            self.comm.send(job, dest=chunk_no+1)
                        
                        ##### wait around for ready workers
                        else:
                            self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                            source = status.Get_source()
                            count_recv += 1
                            self.comm.send(job, dest=source)

                        del job
                        logger.info("dispatched documents up to #%s" % doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(self.num_terms, self.num_topics, job)
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s" % doc_no)
                        self.print_topics(5)

                ##### wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")

                    ##### workers are finishing up
                    while (count_recv < count_sent):
                        self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        count_recv += 1

                    ##### placeholder for the result
                    result = None
                    result_recv = 0

                    ##### send the kill messages
                    for i in xrange(num_workers):
                        self.comm.send(None, dest=i+1)

                    ##### wait for all results
                    while (result_recv < num_workers):
                        r = self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        result_recv += 1
                        if result_recv == 1:
                            result = r
                        else:
                            result.merge(r)

                    logger.info("finished merging projections")
                    self.projection = result

#            logger.info("top topics after adding %i documents" % doc_no)
#            self.print_debug(10)
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert self.onepass, "distributed two-pass algo not supported yet"
            update = Projection(self.num_terms, self.num_topics, corpus.tocsc())
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
示例#39
0
    def update(self,
               corpus=None,
               author2doc=None,
               doc2author=None,
               chunksize=None,
               decay=None,
               offset=None,
               passes=None,
               update_every=None,
               eval_every=None,
               iterations=None,
               gamma_threshold=None,
               chunks_as_numpy=False):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached). `corpus` must be an iterable (repeatable stream of documents).

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0]. Additionally, for smaller
        `corpus` sizes, an increasing `offset` may be beneficial (see
        Table 1 in Hoffman et al.).

        If update is called with authors that already exist in the model, it will
        resume training on not only new documents for that author, but also the
        previously seen documents. This is necessary for those authors' topic
        distributions to converge.

        Every time `update(corpus, author2doc)` is called, the new documents are
        appended to all the previously seen documents, and `author2doc` is
        combined with the previously seen authors.

        To resume training on all the data seen by the model, simply call
        `update()`.

        It is not possible to add new authors to existing documents, as all
        documents in `corpus` are assumed to be new documents.

        Args:
            corpus (gensim corpus): The corpus with which the author-topic model should be updated.

            author2doc (dictionary): author to document mapping corresponding to indexes in input
                corpus.

            doc2author (dictionary): document to author mapping corresponding to indexes in input
                corpus.

            chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np
                array or not. np can in some settings turn the term IDs
                into floats; these will be converted back into integers in
                inference, which incurs a performance hit. For distributed
                computing it may be desirable to keep the chunks as np
                arrays.

        For other parameter settings, see :class:`AuthorTopicModel` constructor.

        """

        # use parameters given in constructor, unless user explicitly overrode them
        if decay is None:
            decay = self.decay
        if offset is None:
            offset = self.offset
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if eval_every is None:
            eval_every = self.eval_every
        if iterations is None:
            iterations = self.iterations
        if gamma_threshold is None:
            gamma_threshold = self.gamma_threshold

        # TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"),
        # the process simply gets killed.
        author2doc = deepcopy(author2doc)
        doc2author = deepcopy(doc2author)

        # TODO: it is not possible to add new authors to an existing document (all input documents are treated
        # as completely new documents). Perhaps this functionality could be implemented.
        # If it's absolutely necessary, the user can delete the documents that have new authors, and call update
        # on them with the new and old authors.

        if corpus is None:
            # Just keep training on the already available data.
            # Assumes self.update() has been called before with input documents and corresponding authors.
            assert self.total_docs > 0, 'update() was called with no documents to train on.'
            train_corpus_idx = [d for d in xrange(self.total_docs)]
            num_input_authors = len(self.author2doc)
        else:
            if doc2author is None and author2doc is None:
                raise ValueError(
                    'at least one of author2doc/doc2author must be specified, to establish input space dimensionality'
                )

            # If either doc2author or author2doc is missing, construct them from the other.
            if doc2author is None:
                doc2author = construct_doc2author(corpus, author2doc)
            elif author2doc is None:
                author2doc = construct_author2doc(doc2author)

            # Number of authors that need to be updated.
            num_input_authors = len(author2doc)

            try:
                len_input_corpus = len(corpus)
            except TypeError:
                logger.warning(
                    "input corpus stream has no len(); counting documents")
                len_input_corpus = sum(1 for _ in corpus)
            if len_input_corpus == 0:
                logger.warning(
                    "AuthorTopicModel.update() called with an empty corpus")
                return

            self.total_docs += len_input_corpus

            # Add new documents in corpus to self.corpus.
            self.extend_corpus(corpus)

            # Obtain a list of new authors.
            new_authors = []
            # Sorting the author names makes the model more reproducible.
            for a in sorted(author2doc.keys()):
                if not self.author2doc.get(a):
                    new_authors.append(a)

            num_new_authors = len(new_authors)

            # Add new authors do author2id/id2author dictionaries.
            for a_id, a_name in enumerate(new_authors):
                self.author2id[a_name] = a_id + self.num_authors
                self.id2author[a_id + self.num_authors] = a_name

            # Increment the number of total authors seen.
            self.num_authors += num_new_authors

            # Initialize the variational distributions q(theta|gamma)
            gamma_new = self.random_state.gamma(
                100., 1. / 100., (num_new_authors, self.num_topics))
            self.state.gamma = np.vstack([self.state.gamma, gamma_new])

            # Combine author2doc with self.author2doc.
            # First, increment the document IDs by the number of previously seen documents.
            for a, doc_ids in author2doc.items():
                doc_ids = [
                    d + self.total_docs - len_input_corpus for d in doc_ids
                ]

            # For all authors in the input corpus, add the new documents.
            for a, doc_ids in author2doc.items():
                if self.author2doc.get(a):
                    # This is not a new author, append new documents.
                    self.author2doc[a].extend(doc_ids)
                else:
                    # This is a new author, create index.
                    self.author2doc[a] = doc_ids

            # Add all new documents to self.doc2author.
            for d, a_list in doc2author.items():
                self.doc2author[d] = a_list

            # Train on all documents of authors in input_corpus.
            train_corpus_idx = []
            for _ in author2doc.keys():  # For all authors in input corpus.
                for doc_ids in self.author2doc.values():  # For all documents in total corpus.
                    train_corpus_idx.extend(doc_ids)

            # Make the list of training documents unique.
            train_corpus_idx = list(set(train_corpus_idx))

        # train_corpus_idx is only a list of indexes, so "len" is valid.
        lencorpus = len(train_corpus_idx)

        if chunksize is None:
            chunksize = min(lencorpus, self.chunksize)

        self.state.numdocs += lencorpus

        if update_every:
            updatetype = "online"
            updateafter = min(lencorpus,
                              update_every * self.numworkers * chunksize)
        else:
            updatetype = "batch"
            updateafter = lencorpus
        evalafter = min(lencorpus,
                        (eval_every or 0) * self.numworkers * chunksize)

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info(
            "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once "
            "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f",
            updatetype, self.num_topics, num_input_authors, passes, lencorpus,
            updateafter, evalafter, iterations, gamma_threshold)

        if updates_per_pass * passes < 10:
            logger.warning(
                "too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy"
            )

        # rho is the "speed" of updating; TODO try other fncs
        # pass_ + num_updates handles increasing the starting t for each pass,
        # while allowing it to "reset" on the first pass of each update
        def rho():
            return pow(offset + pass_ + (self.num_updates / chunksize), -decay)

        for pass_ in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers', self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                # gamma is not needed in "other", thus its shape is (0, 0).
                other = AuthorTopicState(self.eta, self.state.sstats.shape,
                                         (0, 0))
            dirty = False

            reallen = 0
            for chunk_no, chunk_doc_idx in enumerate(
                    utils.grouper(train_corpus_idx,
                                  chunksize,
                                  as_numpy=chunks_as_numpy)):
                chunk = [self.corpus[d] for d in chunk_doc_idx]
                reallen += len(chunk)  # keep track of how many documents we've processed so far

                if eval_every and ((reallen == lencorpus) or
                                   ((chunk_no + 1) %
                                    (eval_every * self.numworkers) == 0)):
                    # log_perplexity requires the indexes of the documents being evaluated, to know what authors
                    # correspond to the documents.
                    self.log_perplexity(chunk,
                                        chunk_doc_idx,
                                        total_docs=lencorpus)

                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info(
                        "PROGRESS: pass %i, dispatching documents up to #%i/%i",
                        pass_, chunk_no * chunksize + len(chunk), lencorpus)
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info("PROGRESS: pass %i, at document #%i/%i", pass_,
                                chunk_no * chunksize + len(chunk), lencorpus)
                    # do_estep requires the indexes of the documents being trained on, to know what authors
                    # correspond to the documents.
                    gammat = self.do_estep(chunk,
                                           self.author2doc, self.doc2author,
                                           rho(), other, chunk_doc_idx)

                    if self.optimize_alpha:
                        self.update_alpha(gammat, rho())

                dirty = True
                del chunk

                # perform an M step. determine when based on update_every, don't do this after every chunk
                if update_every and (chunk_no + 1) % (update_every *
                                                      self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info(
                            "reached the end of input; now waiting for all remaining jobs to finish"
                        )
                        other = self.dispatcher.getstate()
                    self.do_mstep(rho(), other, pass_ > 0)
                    del other  # frees up memory

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = AuthorTopicState(self.eta,
                                                 self.state.sstats.shape,
                                                 (0, 0))
                    dirty = False
            # endfor single corpus iteration
            if reallen != lencorpus:
                raise RuntimeError(
                    "input corpus size changed during training (don't use generators as input)"
                )

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info(
                        "reached the end of input; now waiting for all remaining jobs to finish"
                    )
                    other = self.dispatcher.getstate()
                self.do_mstep(rho(), other, pass_ > 0)
                del other
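In the released gensim API this method is usually reached indirectly: the `AuthorTopicModel` constructor calls `update()` once, and later calls fold in new documents and authors. A small hedged sketch with an invented toy corpus and author names:

from gensim import corpora
from gensim.models import AuthorTopicModel

texts = [["human", "interface", "computer"],
         ["graph", "trees", "minors"],
         ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]
author2doc = {"alice": [0, 1], "bob": [2]}

model = AuthorTopicModel(corpus=corpus, author2doc=author2doc,
                         id2word=dictionary, num_topics=2, passes=5)

# later: new documents, mixing a known author with a new one;
# document indexes in author2doc are relative to the *new* corpus
new_corpus = [dictionary.doc2bow(["human", "computer", "survey"])]
model.update(new_corpus, author2doc={"alice": [0], "carol": [0]})
print(model.get_author_topics("carol"))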
    def train(self, sentences, total_words=None, word_count=0, chunksize=100):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        if FAST_VERSION < 0:
            import warnings

            warnings.warn(
                "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`"
            )
        logger.info(
            "training model with %i workers on %i vocabulary and %i features"
            % (self.workers, len(self.vocab), self.layer1_size)
        )

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")

        start, next_report = time.time(), [1.0]
        word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.itervalues())
        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        jobs = Queue(maxsize=2 * self.workers)
        lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # each thread must have its own work memory

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break
                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                job_words = sum(train_sentence(self, sentence, alpha, work) for sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info(
                            "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s"
                            % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)
                        )
                        # don't flood the log, wait at least a second between progress reports
                        next_report[0] = elapsed + 1.0

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
        no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
        for job_no, job in enumerate(utils.grouper(no_oov, chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info(
            "training on %i words took %.1fs, %.0f words/s"
            % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)
        )

        return word_count[0]
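The producer/consumer skeleton used above (a bounded `Queue` fed with `utils.grouper` chunks and drained by daemon worker threads, with `None` as the shutdown sentinel) is independent of word2vec itself. A stripped-down hedged sketch of just that pattern, with a caller-supplied `process()` standing in for `train_sentence`:

import threading
try:
    from queue import Queue   # Python 3
except ImportError:
    from Queue import Queue   # Python 2

from gensim import utils

def run_chunked_jobs(stream, process, workers=4, chunksize=100):
    """Feed `stream` to `workers` threads in chunks of `chunksize` items."""
    jobs = Queue(maxsize=2 * workers)  # bounded: the producer blocks instead of buffering everything

    def worker():
        while True:
            job = jobs.get()
            if job is None:  # sentinel: no more work, exit
                break
            for item in job:
                process(item)

    threads = [threading.Thread(target=worker) for _ in range(workers)]
    for t in threads:
        t.daemon = True  # make interrupting with ctrl+c easier, as in the code above
        t.start()

    for job in utils.grouper(stream, chunksize):
        jobs.put(job)      # blocks once the queue is full
    for _ in range(workers):
        jobs.put(None)     # one sentinel per worker
    for t in threads:
        t.join()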
    def train(self, input_file=None, total_words=None, word_count=0, chunksize=100, alpha=0.025, alpha_doc=0.025, sentences_length=None):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        Each sentence must be a list of utf8 strings.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`")
        logger.info("training model with %i workers on %i vocabulary and %i features, "
            "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
            (self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative))

        if not self.vocab:
            raise RuntimeError("you must first build vocabulary before training the model")
        
        # set the learning rates
        self.alpha_doc = float(alpha_doc)
        self.alpha = float(alpha)

        start, next_report = time.time(), [1.0]
        word_count = [word_count]
        total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab)))
        jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

        def worker_train():
            """Train the model, lifting lists of sentences from the jobs queue."""
            work = zeros(self.syn1_size, dtype=REAL)  # each thread must have its own work memory
            # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            neu1 = zeros(self.syn1_size, dtype=REAL) 

            while True:
                job = jobs.get()
                if job is None:  # data finished, exit
                    break

                # update the learning rate before every job
                alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
                if self.alpha_flag == 1:
                    alpha = self.alpha
                # print "alpha", alpha
                # how many words did we train on? out-of-vocabulary (unknown) words do not count
                if self.sg:

                    if self.skip_gram_type == 0:
                        # job_id = 0
                        # sentence_id_,_ = job[job_id]
                        # print "py sentence_id = ",sentence_id_
                        # bf = deepcopy(self.doc[sentence_id_][0])
                        # print "bf : ",bf
                        job_words = sum(train_sentence_sg_simple(self, sentence_id,sentence, alpha, work,self.alpha_doc) for sentence_id,sentence in job)
                        # print "af : ",self.doc[sentence_id_][0]
                        # print "re : ", self.doc[sentence_id_][0] - bf
                    elif self.skip_gram_type == 1:

                        # ids_back = [sentence_id for sentence_id,_ in job]
                        # bf_ = deepcopy(self.doc[ids_back])


                        job_words = sum(train_sentence_sg_average(self, sentence_id,sentence, alpha, work, neu1 ,self.alpha_doc) for sentence_id,sentence in job)

                        # af_ = self.doc[ids_back]
                        # print numpy.mean(af_ - bf_ )

                    elif self.skip_gram_type == 2:

                        ids_back = [sentence_id for sentence_id,_ in job]
                        bf_ = deepcopy(self.doc[ids_back])

                        job_words = sum(train_sentence_sg_concat(self, sentence_id,sentence, alpha, work,neu1,self.alpha_doc) for sentence_id,sentence in job)

                        af_ = self.doc[ids_back]
                        print numpy.mean(af_ - bf_ )

                elif self.cbow_type == 4:

                    ids_back = [sentence_id for sentence_id,_ in job]
                    bf_ = deepcopy(self.doc[ids_back])

                    # job_words = sum(train_sentence_cbow_average_plus_doc_vec_extra_train(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)
                    job_words = sum(train_sentence_cbow_average_plus_doc(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)


                    af_ = self.doc[ids_back]
                    print numpy.mean(af_ - bf_ )

                    # print "re : ", af_ - bf_
                # elif self.cbow_type == 5:
                #     job_words = sum(train_sentence_cbow_concatenate_v2(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)
                elif self.cbow_type == 3:
                    job_id = 0
                    ids_back = [sentence_id for sentence_id,_ in job]
                    bf_ = deepcopy(self.doc[ids_back])
                    sentence_id_,sentence_ = job[job_id]
                    # # print "py sentence_id = ",sentence_id_
                    # bf = deepcopy(self.doc[sentence_id_])
                    # print "bf : ",bf
                    # print "null_vec", self.null_vec
                    job_words = sum(train_sentence_cbow_concatenate(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)

                    # af_ = self.doc[ids_back]
                    # print numpy.mean(af_ - bf_ )
                    # print "af : ",self.doc[sentence_id_]
                    # print "re : ", self.doc[sentence_id_] - bf


                    # print sum(self.doc[sentence_id_] - bf)
                elif self.cbow_type == 2:
                    job_words = sum(train_sentence_cbow_concatenate_syn1_doc(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)
                elif self.cbow_type == 1:

                    ids_back = [sentence_id for sentence_id,_ in job]
                    bf_ = deepcopy(self.doc[ids_back])
                    job_words = sum(train_sentence_cbow_average_simple(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)


                    af_ = self.doc[ids_back]
                    print numpy.mean(af_ - bf_ )
                    # print af_ - bf_ 


                elif self.cbow_type == 0:
                    job_words = sum(train_sentence_cbow_syn1_doc(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job)
                with lock:
                    word_count[0] += job_words
                    elapsed = time.time() - start
                    if elapsed >= next_report[0]:
                        logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                            (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                        next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

        workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        def prepare_sentences():
            '''
                Shuffle the sentence indexes with random.shuffle here, so sentences are trained on in random order.
            '''
            if self.random_learn_flag:
                # shuffle the input data randomly and train on it in that order
                indexes_sentence_ids = numpy.array(range(sentences_length))
                random.shuffle(indexes_sentence_ids, lambda: random_seed)
                sentences = [(indexes_sentence_ids[index],sentence) for index,sentence in enumerate(open(input_file))]

            else:
                sentences = enumerate(open(input_file))

            for sentence_id, sentence in sentences:
                sentence = sentence.split(u" ")
                # skip sentences that have already been trained on (when continuing training from a saved model)
                if sentence_id < self.skip_id:
                    print "skip! :"+str(sentence_id) +" "+str(self.skip_id)
                    continue
                sampled = [self.vocab[word] for word in sentence
                    if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or self.vocab[word].sample_probability >= numpy.random.random_sample())]
                yield (sentence_id,sampled)

        # no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)
        # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
        for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)):
            logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
            jobs.put(job)
        logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
        for _ in xrange(self.workers):
            jobs.put(None)  # give the workers heads up that they can finish -- no more work!

        for thread in workers:
            thread.join()

        elapsed = time.time() - start
        logger.info("training on %i words took %.1fs, %.0f words/s" %
            (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

        return word_count[0]
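Both `train` variants above anneal the learning rate linearly with progress through the corpus, flooring it at `min_alpha`. A tiny illustration of that schedule with made-up numbers:

def annealed_alpha(alpha0, min_alpha, words_done, total_words):
    """Linear learning-rate decay, as computed inside the worker threads above."""
    return max(min_alpha, alpha0 * (1.0 - float(words_done) / total_words))

# e.g. starting at 0.025, floored at 0.0001, over a corpus of one million words
for done in (0, 250000, 500000, 750000, 1000000):
    print(annealed_alpha(0.025, 0.0001, done, 1000000))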