def serialize(filename_prefix, layer, current_representation, num_terms=None, chunksize=10000):
    is_corpus, current_representation = utils.is_corpus(current_representation)
    if is_corpus:
        for chunk_no, chunk in enumerate(utils.grouper(current_representation, chunksize)):
            ln.debug("preparing chunk for conversion (%s documents)..." % len(chunk))
            assert num_terms is not None, "Need num_terms to properly handle sparse corpus format"
            chunk_as_csc = matutils.corpus2csc(chunk, num_terms=num_terms)

            ln.debug("Chunk converted to csc, running through layer...")
            chunk_trans = layer.__getitem__(chunk_as_csc)

            ln.debug("Serializing hidden representation...")
            fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
            np.save(fname, chunk_trans)
            ln.debug("Finished serializing chunk. Processed %s documents so far." % (chunk_no * chunksize + len(chunk)))
    else:
        ln.info("Beginning serialization of non-gensim corpus format intermediate representation.")
        ln.debug("Type of current_representation is %s" % type(current_representation))
        for chunk_no, chunk in enumerate(current_representation):
            ln.debug("converting chunk (%s documents)..." % chunksize)
            chunk_trans = layer.__getitem__(chunk)
            ln.debug("Serializing hidden representation...")
            fname = "%s_%s" % (filename_prefix, ("0" * (15 - len(str(chunk_no)))) + str(chunk_no))
            np.save(fname, chunk_trans)
            ln.debug("finished serializing chunk.")

    ln.info("Finished serializing all chunks.")
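# Hedged usage sketch (illustration only, not part of the original source): the chunking
# pattern shared by serialize() above and the functions below -- stream a bag-of-words
# corpus through gensim's utils.grouper() in fixed-size batches for constant memory use.
from gensim import utils

toy_corpus = [[(0, 1.0)], [(1, 2.0)], [(2, 1.0)], [(0, 3.0)], [(1, 1.0)]]  # five BoW documents
for toy_chunk_no, toy_chunk in enumerate(utils.grouper(toy_corpus, 2)):
    # each chunk is a list of at most 2 documents; the last chunk may be smaller
    print("chunk %i holds %i documents" % (toy_chunk_no, len(toy_chunk)))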
def update(self, corpus):
    save_freq = max(1, int(10000 / self.chunksize))  # save every 10k docs, roughly
    chunks_processed = 0
    start_time = time.clock()

    while True:
        for chunk in utils.grouper(corpus, self.chunksize):
            self.update_chunk(chunk)
            self.m_num_docs_processed += len(chunk)
            chunks_processed += 1

            if self.update_finished(start_time, chunks_processed, self.m_num_docs_processed):
                self.update_expectations()
                alpha, beta = self.hdp_to_lda()
                self.lda_alpha = alpha
                self.lda_beta = beta
                self.print_topics(20)
                if self.outputdir:
                    self.save_topics()
                return

            elif chunks_processed % save_freq == 0:
                self.update_expectations()
                # self.save_topics(self.m_num_docs_processed)
                self.print_topics(20)
                logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D)
def __iter__(self):
    if self.chunksize:
        for chunk in utils.grouper(self.corpus, self.chunksize):
            for transformed in self.obj.__getitem__(chunk, chunksize=None):
                yield transformed
    else:
        for doc in self.corpus:
            yield self.obj[doc]
def transformed_corpus():
    for chunk_no, doc_chunk in enumerate(utils.grouper(bow, chunksize)):
        ln.debug("Converting chunk %s to csc format..." % chunk_no)
        chunk = matutils.corpus2csc(doc_chunk, self.input_dimensionality)
        ln.debug("Computing hidden representation for chunk...")
        hidden = self._get_hidden_representations(chunk)
        ln.info("Finished computing representation for chunk %s, yielding results. Total docs processed: %s" %
                (chunk_no, chunk_no * chunksize + len(doc_chunk)))
        for column in hidden.T:
            yield matutils.dense2vec(column.T)
        ln.debug("Done yielding chunk %s" % chunk_no)

    ln.info("Finished computing representations for all chunks.")
def inferDTMseq(self, corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound,
                lda_inference_max_iter, chunksize):
    """
    Compute the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound.

    Needs the LdaSeq model, corpus, sufficient stats, gammas and lhoods matrices previously created,
    and LdaModel and LdaPost class objects.
    """
    doc_index = 0  # overall doc_index in corpus
    time = 0  # current time-slice
    doc_num = 0  # doc-index in current time-slice
    lda = self.make_lda_seq_slice(lda, time)  # create lda_seq slice
    time_slice = np.cumsum(np.array(self.time_slice))

    for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
        # iterates chunk size for constant memory footprint
        for doc in chunk:
            # this is used to update the time_slice and create a new lda_seq slice every new time_slice
            if doc_index > time_slice[time]:
                time += 1
                lda = self.make_lda_seq_slice(lda, time)  # create lda_seq slice
                doc_num = 0

            gam = gammas[doc_index]
            lhood = lhoods[doc_index]
            ldapost.gamma = gam
            ldapost.lhood = lhood
            ldapost.doc = doc

            # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible.
            if iter_ == 0:
                doc_lhood = LdaPost.fit_lda_post(
                    ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter
                )
            else:
                doc_lhood = LdaPost.fit_lda_post(
                    ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter
                )

            if topic_suffstats is not None:
                topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, doc, topic_suffstats)

            gammas[doc_index] = ldapost.gamma
            bound += doc_lhood
            doc_index += 1
            doc_num += 1

    return bound, gammas
def __iter__(self):
    for chunk_no, chunk in enumerate(utils.grouper(self.corpus, self.chunksize)):
        nnz = sum(len(doc) for doc in chunk)
        # construct the job as a sparse matrix, to minimize memory overhead
        # definitely avoid materializing it as a dense matrix!
        # ln.debug("converting corpus to csc format")
        if self.dense:
            job = matutils.corpus2dense(chunk, num_docs=len(chunk), num_terms=self.num_terms)
        else:
            job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz)

        if self.filter_dimensions is not None:
            filtered = job[self.filter_dimensions, :]
        else:
            filtered = None

        yield job, filtered
        del chunk
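# Hedged sketch (illustration only, not from the original source): how a BoW chunk is
# turned into a sparse CSC job matrix, as in __iter__ above. gensim's corpus2csc returns
# a (num_terms, num_docs) matrix with one document per column.
from gensim import matutils

demo_chunk = [[(0, 1.0), (2, 2.0)], [(1, 1.0)]]  # two BoW documents
demo_nnz = sum(len(doc) for doc in demo_chunk)
demo_job = matutils.corpus2csc(demo_chunk, num_terms=3, num_docs=len(demo_chunk), num_nnz=demo_nnz)
print(demo_job.shape)  # (3, 2)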
def __iter__(self):
    """Iterate over the corpus.

    If `chunksize` is set, works in "batch-manner" (more efficient).

    Yields
    ------
    list of (int, number)
        Document in BoW format

    """
    if self.chunksize:
        for chunk in utils.grouper(self.corpus, self.chunksize):
            for transformed in self.obj.__getitem__(chunk, chunksize=None):
                yield transformed
    else:
        for doc in self.corpus:
            yield self.obj[doc]
def __iter__(self):
    """Iterate over the corpus, applying the selected transformation.

    If `chunksize` was set in the constructor, works in "batch-manner" (more efficient).

    Yields
    ------
    list of (int, number)
        Documents in the sparse Gensim bag-of-words format.

    """
    if self.chunksize:
        for chunk in utils.grouper(self.corpus, self.chunksize):
            for transformed in self.obj.__getitem__(chunk, chunksize=None):
                yield transformed
    else:
        for doc in self.corpus:
            yield self.obj[doc]
def __init__(self, noise, lambda_, input_dimensionality, output_dimensionality=None, prototype_ids=None):
    self.noise = noise
    self.lambda_ = lambda_
    self.input_dimensionality = input_dimensionality
    self.output_dimensionality = output_dimensionality or input_dimensionality

    if self.output_dimensionality != self.input_dimensionality:
        if prototype_ids is None:
            ln.warn("Need prototype IDs to train reduction layer.")
        self.randomized_indices = list(
            utils.grouper(np.random.permutation(self.input_dimensionality), self.output_dimensionality)
        )
        for idx_batch in self.randomized_indices:
            idx_batch.sort()  # should be more efficient when selecting array rows in order later on
        self.prototype_ids = prototype_ids

    self.num_folds = int(np.ceil(float(self.input_dimensionality) / self.output_dimensionality))
    self.blocks = []
def update(self, corpus):
    """Train the model with new documents, by EM-iterating over `corpus` until any of the conditions is satisfied.

    * time limit expired
    * chunk limit reached
    * whole corpus processed

    Parameters
    ----------
    corpus : iterable of list of (int, float)
        Corpus in BoW format.

    """
    save_freq = max(1, int(10000 / self.chunksize))  # save every 10k docs, roughly
    chunks_processed = 0
    start_time = time.clock()

    while True:
        for chunk in utils.grouper(corpus, self.chunksize):
            self.update_chunk(chunk)
            self.m_num_docs_processed += len(chunk)
            chunks_processed += 1

            if self.update_finished(start_time, chunks_processed, self.m_num_docs_processed):
                self.update_expectations()
                alpha, beta = self.hdp_to_lda()
                self.lda_alpha = alpha
                self.lda_beta = beta
                self.print_topics(20)
                if self.outputdir:
                    self.save_topics()
                return

            elif chunks_processed % save_freq == 0:
                self.update_expectations()
                # self.save_topics(self.m_num_docs_processed)
                self.print_topics(20)
                logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D)
def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=None):
    """
    Train the model with new documents, by EM-iterating over `corpus` until the topics converge
    (or until the maximum number of allowed iterations is reached).

    In distributed mode, the E step is distributed over a cluster of machines.

    This update also supports updating an already trained model (`self`) with new documents from `corpus`;
    the two models are then merged in proportion to the number of old vs. new documents.
    This feature is still experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand, this equals the online update
    of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0].
    """
    # use parameters given in constructor, unless user explicitly overrode them
    if chunksize is None:
        chunksize = self.chunksize
    if decay is None:
        decay = self.decay
    if passes is None:
        passes = self.passes
    if update_every is None:
        update_every = self.update_every

    # rho is the "speed" of updating; TODO try other fncs
    rho = lambda: pow(1.0 + self.num_updates, -decay)

    try:
        lencorpus = len(corpus)
    except TypeError:
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        logger.warning("LdaModel.update() called with an empty corpus")
        return

    self.state.numdocs += lencorpus

    if update_every > 0:
        updatetype = "online"
        updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
    else:
        updatetype = "batch"
        updateafter = lencorpus

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info("running %s LDA training, %s topics, %i passes over "
                "the supplied corpus of %i documents, updating model once "
                "every %i documents" %
                (updatetype, self.num_topics, passes, lencorpus, updateafter))
    if updates_per_pass * passes < 10:
        logger.warning("too few updates, training might not converge; consider "
                       "increasing the number of passes to improve accuracy")

    for iteration in xrange(passes):
        ##### reset all workers
        if self.dispatcher:
            status = MPI.Status()
            ##### send reset message, with current state; ensure all ready
            for i in xrange(self.numworkers):
                self.comm.sendrecv(self.state, dest=i + 1, sendtag=RESET, source=i + 1)
            logger.info('initializing %s workers' % self.numworkers)
        else:
            other = LdaState(self.eta, self.state.sstats.shape)
        dirty = False

        ##### counters for jobs
        count_sent = 0
        count_recv = 0

        for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)):
            if self.dispatcher:
                ##### send some work
                logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
                            (iteration, chunk_no * chunksize + len(chunk), lencorpus))
                count_sent += 1
                status = MPI.Status()
                ##### send the initial batch
                if chunk_no < self.numworkers:
                    self.comm.send(chunk, dest=chunk_no + 1, tag=WORK)
                ##### send work if we just cleaned out the workers
                elif not dirty:
                    self.comm.send(chunk, dest=(chunk_no % self.numworkers) + 1, tag=WORK)
                ##### wait around for ready workers
                else:
                    self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                    source = status.Get_source()
                    count_recv += 1
                    self.comm.send(chunk, dest=source, tag=WORK)
            else:
                logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                            (iteration, chunk_no * chunksize + len(chunk), lencorpus))
                self.do_estep(chunk, other)
            dirty = True
            del chunk

            # perform an M step. determine when based on update_every, don't do this after every chunk
            if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                ##### wait for all workers to finish
                if self.dispatcher:
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    ##### workers are finishing up
                    while count_recv < count_sent:
                        self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        count_recv += 1
                    ##### placeholder for the result
                    result = None
                    result_recv = 0
                    ##### send the merge/clear messages
                    for i in xrange(self.numworkers):
                        self.comm.send(None, dest=i + 1, tag=MERGE)
                    ##### wait for all results
                    while result_recv < self.numworkers:
                        r = self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                        result_recv += 1
                        if result_recv == 1:
                            result = r
                        else:
                            result.merge(r)
                    other = result

                self.do_mstep(rho(), other)
                del other  # free up some mem

                if self.dispatcher:
                    logger.info('initializing workers')
                    ##### send reset message, with current state
                    for i in xrange(self.numworkers):
                        self.comm.sendrecv(self.state, dest=i + 1, sendtag=RESET, source=i + 1)
                else:
                    other = LdaState(self.eta, self.state.sstats.shape)
                dirty = False
        # endfor single corpus iteration

        if dirty:
            # finish any remaining updates
            if self.dispatcher:
                # distributed mode: wait for all workers to finish
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                ##### workers are finishing up
                while count_recv < count_sent:
                    self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                    count_recv += 1
                ##### placeholder for the result
                result = None
                result_recv = 0
                ##### send the merge/clear messages
                for i in xrange(self.numworkers):
                    self.comm.send(None, dest=i + 1, tag=MERGE)
                ##### wait for all results
                while result_recv < self.numworkers:
                    r = self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status)
                    result_recv += 1
                    if result_recv == 1:
                        result = r
                    else:
                        result.merge(r)
                other = result

            self.do_mstep(rho(), other)
            del other
            dirty = False
    # endfor entire corpus update

    ##### kill the workers
    if self.dispatcher:
        for i in xrange(self.numworkers):
            self.comm.send(None, dest=i + 1, tag=DIE)
        logger.info("workers are dead")
def train(self, sentences, total_words=None, word_count=0, chunksize=100, total_examples=None,
          queue_factor=2, report_delay=1):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    For word2mat, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

    To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
    (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
    sentences are the same as those that were used to initially build the vocabulary.
    """
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension not loaded for word2mat, training will be slow. "
                      "Install a C compiler and reinstall gensim for fast training.")
        self.neg_labels = []
        if self.negative > 0:
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1)
            self.neg_labels[0] = 1.

    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using sg=%s hs=%s sample=%s and negative=%s",
        self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative)

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")
    if not hasattr(self, 'syn0'):
        raise RuntimeError("you must first finalize vocabulary before training the model")

    if total_words is None and total_examples is None:
        if self.corpus_count:
            total_examples = self.corpus_count
            logger.info("expecting %i examples, matching count from corpus used for vocabulary survey", total_examples)
        else:
            raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations")

    logging.info("initialize sentence")
    sentences = EnumerateSentence(sentences)
    logging.info("initialize sentence finish")

    if self.iter > 1:
        sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
        total_words = total_words and total_words * self.iter
        total_examples = total_examples and total_examples * self.iter

    def worker_init():
        work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        context_vector = matutils.zeros_aligned(self.topic_size, dtype=REAL)
        return (work, neu1, context_vector)

    def worker_one_job(job, inits):
        items, alpha = job
        if items is None:  # signal to finish
            return False
        # train & return tally
        tally, raw_tally = self._do_train_job(items, alpha, inits)
        progress_queue.put((len(items), tally, raw_tally))  # report progress
        return True

    def worker_loop():
        """Train the model, lifting lists of sentences from the jobs queue."""
        init = worker_init()
        while True:
            job = job_queue.get()
            if not worker_one_job(job, init):
                break

    start, next_report = default_timer(), 1.0

    # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    if self.workers > 0:
        job_queue = Queue(maxsize=queue_factor * self.workers)
    else:
        job_queue = FakeJobQueue(worker_init, worker_one_job)
    progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

    workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    pushed_words = 0
    pushed_examples = 0
    example_count = 0
    trained_word_count = 0
    raw_word_count = word_count
    push_done = False
    done_jobs = 0
    next_alpha = self.alpha
    jobs_source = enumerate(utils.grouper(sentences, chunksize))
    # fill jobs queue with (sentence, alpha) job tuples
    while True:
        try:
            job_no, items = next(jobs_source)
            logger.debug("putting job #%i in the queue at alpha %.05f", job_no, next_alpha)
            job_queue.put((items, next_alpha))
            # update the learning rate before every next job
            if self.min_alpha < next_alpha:
                if total_examples:
                    # examples-based decay
                    pushed_examples += len(items)
                    next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_examples / total_examples)
                else:
                    # words-based decay
                    # pushed_words += self._raw_word_count(items)
                    pushed_words += self._raw_word_count([item[1] for item in items])
                    next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_words / total_words)
                next_alpha = max(next_alpha, self.min_alpha)
        except StopIteration:
            logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1)
            for _ in xrange(self.workers):
                job_queue.put((None, 0))  # give the workers heads up that they can finish -- no more work!
            push_done = True
        try:
            while done_jobs < (job_no + 1) or not push_done:
                examples, trained_words, raw_words = progress_queue.get(push_done)  # only block after all jobs pushed
                example_count += examples
                trained_word_count += trained_words  # only words in vocab & sampled
                raw_word_count += raw_words
                done_jobs += 1
                elapsed = default_timer() - start
                if elapsed >= next_report:
                    if total_examples:
                        # examples-based progress %
                        logger.info(
                            "PROGRESS: at %.2f%% examples, %.0f words/s",
                            100.0 * example_count / total_examples, trained_word_count / elapsed)
                    else:
                        # words-based progress %
                        logger.info(
                            "PROGRESS: at %.2f%% words, %.0f words/s",
                            100.0 * raw_word_count / total_words, trained_word_count / elapsed)
                    next_report = elapsed + report_delay  # don't flood log, wait report_delay seconds
            else:
                # loop ended by job count; really done
                break
        except Empty:
            pass  # already out of loop; continue to next push

    elapsed = default_timer() - start
    logger.info(
        "training on %i raw words took %.1fs, %.0f trained words/s",
        raw_word_count, elapsed, trained_word_count / elapsed if elapsed else 0.0)

    if total_examples and total_examples != example_count:
        logger.warn("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples)
    if total_words and total_words != raw_word_count:
        logger.warn("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words)

    self.train_count += 1  # number of times train() has been called
    self.total_train_time += elapsed
    self.clear_sims()
    return trained_word_count
def once_test(self, sentences, total_words=None, word_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of unicode strings.
    """
    if FAST_VERSION < 0:
        import warnings
        warnings.warn(
            "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`"
        )
    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
        (self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative))

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count = [word_count]
    # total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab)))
    total_words = 0
    for i in range(len(sentences)):
        total_words += len(sentences[i])
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = zeros(self.layer1_size * self.window, dtype=REAL)  # each thread must have its own work memory
        neu1 = matutils.zeros_aligned(self.layer1_size * self.window, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(
                self.min_alpha,
                self.alpha * (1 - 1.0 * (word_count[0] + self.now_iterated * total_words) /
                              (total_words * self.iteration)))
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = sum(
                train_sentence_test(self, sentence[0], sentence[1], alpha, work, neu1)
                for sentence in job)
            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    print "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (
                        100.0 * (word_count[0] + self.now_iterated * total_words) / (total_words * self.iteration),
                        alpha, word_count[0] / elapsed if elapsed else 0.0)
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    def prepare_sentences():
        for number, sentence in enumerate(sentences):
            # avoid calling random_sample() where prob >= 1, to speed things up a little:
            sampled = [
                self.vocab[word] for word in sentence
                if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or
                                           self.vocab[word].sample_probability >= random.random_sample())
            ]
            sampled = (number, sampled)
            yield sampled

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

    return word_count[0]
def train(self, texts, chunksize=100, workers=2):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    logger.info("training model with %i workers" % (workers))

    start, next_report = time.time(), [1.0]
    jobs = Queue(maxsize=2 * workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)
    total_error = [0.0]
    objects_done = [0]

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        observation_work = np.zeros(self.window * self.size + self.object_size, dtype=REAL)
        prediction_work = np.zeros(self.output_size, dtype=REAL)
        composition_work = np.zeros(
            [max(self.output_size, self.window * self.size + self.object_size),
             self.window * self.size + self.object_size],
            dtype=REAL) if self.bilinear_form else None

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            error = sum(
                train_sentence_concatenation(
                    self, sentence, object_index, softmax_target, sigmoid_target,
                    self._alpha, prediction_work, observation_work, composition_work)
                for sentence, object_index, softmax_target, sigmoid_target in job)
            with lock:
                total_error[0] += error
                objects_done[0] += len(job)
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: %s objects, %.0f objects/s" %
                                (objects_done[0], float(objects_done[0]) / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    dynos = [threading.Thread(target=worker_train) for _ in range(0, workers)]
    for thread in dynos:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue
    no_oov = ((np.array([self.vocab.get_index(word) for word in sentence], dtype=INT),
               object_index, softmax_target, sigmoid_target)
              for sentence, object_index, softmax_target, sigmoid_target in texts)
    for job_no, job in enumerate(gensim_utils.grouper(no_oov, chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in range(0, workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in dynos:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i objects took %.1fs, %.0f words/s" %
                (objects_done[0], elapsed, objects_done[0] / elapsed if elapsed else 0.0))

    return (objects_done[0], total_error[0])
def train(self, sentences, total_words=None, word_count=0, chunksize=100):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of unicode strings.
    """
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension compilation failed, training will be slow. "
                      "Install a C compiler and reinstall gensim for fast training.")
    logger.info("training model with %i workers on %i vocabulary and %i features, "
                "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
                (self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative))

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count = [word_count]
    total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab)) * self.iter)
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = zeros(self.layer1_size, dtype=REAL)  # each thread must have its own work memory
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = self._get_job_words(alpha, work, job, neu1)
            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                                (100.0 * word_count[0] / total_words, alpha,
                                 word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job_no, job in enumerate(utils.grouper(self._prepare_sentences(sentences), chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

    self.syn0norm = None
    return word_count[0]
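# Hedged illustration (not from the original source) of the linear learning-rate decay
# that worker_train() above applies before every job: alpha falls linearly from its
# initial value toward min_alpha as the trained word count approaches total_words.
def decayed_alpha_example(alpha, min_alpha, word_count, total_words):
    return max(min_alpha, alpha * (1 - 1.0 * word_count / total_words))

print(decayed_alpha_example(0.025, 0.0001, 5000, 10000))   # 0.0125, halfway through training
print(decayed_alpha_example(0.025, 0.0001, 10000, 10000))  # clipped to the 0.0001 floor at the end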
def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None,
           passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None,
           chunks_as_numpy=False):
    """
    Train the model with new documents, by EM-iterating over `corpus` until the topics converge
    (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable
    (repeatable stream of documents).

    This update also supports updating an already trained model (`self`) with new documents from `corpus`;
    the two models are then merged in proportion to the number of old vs. new documents.
    This feature is still experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand, this equals the online update
    of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0]. Additionally, for smaller
    `corpus` sizes, an increasing `offset` may be beneficial (see Table 1 in Hoffman et al.)

    If update is called with authors that already exist in the model, it will resume training on not only
    new documents for that author, but also the previously seen documents. This is necessary for those
    authors' topic distributions to converge.

    Every time `update(corpus, author2doc)` is called, the new documents are appended to all the previously
    seen documents, and author2doc is combined with the previously seen authors.

    To resume training on all the data seen by the model, simply call `update()`.

    It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed
    to be new documents.

    Args:
        corpus (gensim corpus): The corpus with which the author-topic model should be updated.
        author2doc (dictionary): author to document mapping corresponding to indexes in input corpus.
        doc2author (dictionary): document to author mapping corresponding to indexes in input corpus.
        chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np array or not.
            np can in some settings turn the term IDs into floats; these will be converted back into
            integers in inference, which incurs a performance hit. For distributed computing it may be
            desirable to keep the chunks as np arrays.

    For other parameter settings, see :class:`AuthorTopicModel` constructor.
    """
    # use parameters given in constructor, unless user explicitly overrode them
    if decay is None:
        decay = self.decay
    if offset is None:
        offset = self.offset
    if passes is None:
        passes = self.passes
    if update_every is None:
        update_every = self.update_every
    if eval_every is None:
        eval_every = self.eval_every
    if iterations is None:
        iterations = self.iterations
    if gamma_threshold is None:
        gamma_threshold = self.gamma_threshold

    # TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically
    # "testPasses"), the process simply gets killed.
    author2doc = deepcopy(author2doc)
    doc2author = deepcopy(doc2author)

    # TODO: it is not possible to add new authors to an existing document (all input documents are treated
    # as completely new documents). Perhaps this functionality could be implemented.
    # If it's absolutely necessary, the user can delete the documents that have new authors, and call update
    # on them with the new and old authors.

    if corpus is None:
        # Just keep training on the already available data.
        # Assumes self.update() has been called before with input documents and corresponding authors.
        assert self.total_docs > 0, 'update() was called with no documents to train on.'
        train_corpus_idx = [d for d in xrange(self.total_docs)]
        num_input_authors = len(self.author2doc)
    else:
        if doc2author is None and author2doc is None:
            raise ValueError('at least one of author2doc/doc2author must be specified, to establish input space dimensionality')

        # If either doc2author or author2doc is missing, construct them from the other.
        if doc2author is None:
            doc2author = construct_doc2author(corpus, author2doc)
        elif author2doc is None:
            author2doc = construct_author2doc(doc2author)

        # Number of authors that need to be updated.
        num_input_authors = len(author2doc)

        try:
            len_input_corpus = len(corpus)
        except TypeError:
            logger.warning("input corpus stream has no len(); counting documents")
            len_input_corpus = sum(1 for _ in corpus)
        if len_input_corpus == 0:
            logger.warning("AuthorTopicModel.update() called with an empty corpus")
            return

        self.total_docs += len_input_corpus

        # Add new documents in corpus to self.corpus.
        self.extend_corpus(corpus)

        # Obtain a list of new authors.
        new_authors = []
        # Sorting the author names makes the model more reproducible.
        for a in sorted(author2doc.keys()):
            if not self.author2doc.get(a):
                new_authors.append(a)

        num_new_authors = len(new_authors)

        # Add new authors to author2id/id2author dictionaries.
        for a_id, a_name in enumerate(new_authors):
            self.author2id[a_name] = a_id + self.num_authors
            self.id2author[a_id + self.num_authors] = a_name

        # Increment the number of total authors seen.
        self.num_authors += num_new_authors

        # Initialize the variational distributions q(theta|gamma)
        gamma_new = self.random_state.gamma(100., 1. / 100., (num_new_authors, self.num_topics))
        self.state.gamma = np.vstack([self.state.gamma, gamma_new])

        # Combine author2doc with self.author2doc.
        # First, increment the document IDs by the number of previously seen documents.
        for a, doc_ids in author2doc.items():
            doc_ids = [d + self.total_docs - len_input_corpus for d in doc_ids]

        # For all authors in the input corpus, add the new documents.
        for a, doc_ids in author2doc.items():
            if self.author2doc.get(a):
                # This is not a new author, append new documents.
                self.author2doc[a].extend(doc_ids)
            else:
                # This is a new author, create index.
                self.author2doc[a] = doc_ids

        # Add all new documents to self.doc2author.
        for d, a_list in doc2author.items():
            self.doc2author[d] = a_list

        # Train on all documents of authors in input_corpus.
        train_corpus_idx = []
        for _ in author2doc.keys():  # For all authors in input corpus.
            for doc_ids in self.author2doc.values():  # For all documents in total corpus.
                train_corpus_idx.extend(doc_ids)

        # Make the list of training documents unique.
        train_corpus_idx = list(set(train_corpus_idx))

    # train_corpus_idx is only a list of indexes, so "len" is valid.
    lencorpus = len(train_corpus_idx)

    if chunksize is None:
        chunksize = min(lencorpus, self.chunksize)

    self.state.numdocs += lencorpus

    if update_every:
        updatetype = "online"
        updateafter = min(lencorpus, update_every * self.numworkers * chunksize)
    else:
        updatetype = "batch"
        updateafter = lencorpus

    evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize)

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info(
        "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of "
        "%i documents, updating model once every %i documents, evaluating perplexity every %i documents, "
        "iterating %ix with a convergence threshold of %f",
        updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter,
        evalafter, iterations, gamma_threshold
    )

    if updates_per_pass * passes < 10:
        logger.warning("too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy")

    # rho is the "speed" of updating; TODO try other fncs
    # pass_ + num_updates handles increasing the starting t for each pass,
    # while allowing it to "reset" on the first pass of each update
    def rho():
        return pow(offset + pass_ + (self.num_updates / chunksize), -decay)

    for pass_ in xrange(passes):
        if self.dispatcher:
            logger.info('initializing %s workers', self.numworkers)
            self.dispatcher.reset(self.state)
        else:
            # gamma is not needed in "other", thus its shape is (0, 0).
            other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
        dirty = False

        reallen = 0
        for chunk_no, chunk_doc_idx in enumerate(utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)):
            chunk = [self.corpus[d] for d in chunk_doc_idx]
            reallen += len(chunk)  # keep track of how many documents we've processed so far

            if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)):
                # log_perplexity requires the indexes of the documents being evaluated, to know what authors
                # correspond to the documents.
                self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus)

            if self.dispatcher:
                # add the chunk to dispatcher's job queue, so workers can munch on it
                logger.info(
                    "PROGRESS: pass %i, dispatching documents up to #%i/%i",
                    pass_, chunk_no * chunksize + len(chunk), lencorpus
                )
                # this will eventually block until some jobs finish, because the queue has a small finite length
                self.dispatcher.putjob(chunk)
            else:
                logger.info(
                    "PROGRESS: pass %i, at document #%i/%i",
                    pass_, chunk_no * chunksize + len(chunk), lencorpus
                )
                # do_estep requires the indexes of the documents being trained on, to know what authors
                # correspond to the documents.
                gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx)

                if self.optimize_alpha:
                    self.update_alpha(gammat, rho())

            dirty = True
            del chunk

            # perform an M step. determine when based on update_every, don't do this after every chunk
            if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()

                self.do_mstep(rho(), other, pass_ > 0)
                del other  # frees up memory

                if self.dispatcher:
                    logger.info('initializing workers')
                    self.dispatcher.reset(self.state)
                else:
                    other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0))
                dirty = False
        # endfor single corpus iteration

        if reallen != lencorpus:
            raise RuntimeError("input corpus size changed during training (don't use generators as input)")

        if dirty:
            # finish any remaining updates
            if self.dispatcher:
                # distributed mode: wait for all workers to finish
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                other = self.dispatcher.getstate()

            self.do_mstep(rho(), other, pass_ > 0)
            del other
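# Hedged illustration (not from the original source) of the rho() schedule used by the
# author-topic update above: the update "speed" decays as passes and updates accumulate.
def rho_example(offset, pass_, num_updates, chunksize, decay):
    return pow(offset + pass_ + (num_updates / chunksize), -decay)

print(rho_example(offset=1.0, pass_=0, num_updates=0, chunksize=2000, decay=0.5))      # 1.0
print(rho_example(offset=1.0, pass_=0, num_updates=20000, chunksize=2000, decay=0.5))  # ~0.30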
def update(self, corpus, chunks_as_numpy=False):
    """
    Train the model with new documents, by EM-iterating over `corpus` until the topics converge
    (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable
    (repeatable stream of documents). The E-step is distributed into the several processes.

    This update also supports updating an already trained model (`self`) with new documents from `corpus`;
    the two models are then merged in proportion to the number of old vs. new documents.
    This feature is still experimental for non-stationary input streams.

    For stationary input (no topic drift in new documents), on the other hand, this equals the online update
    of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0].
    """
    try:
        lencorpus = len(corpus)
    except TypeError:
        logger.warning("input corpus stream has no len(); counting documents")
        lencorpus = sum(1 for _ in corpus)
    if lencorpus == 0:
        logger.warning("LdaMulticore.update() called with an empty corpus")
        return

    self.state.numdocs += lencorpus

    if not self.batch:
        updatetype = "online"
        updateafter = self.chunksize * self.workers
    else:
        updatetype = "batch"
        updateafter = lencorpus
    evalafter = min(lencorpus, (self.eval_every or 0) * updateafter)

    updates_per_pass = max(1, lencorpus / updateafter)
    logger.info(
        "running %s LDA training, %s topics, %i passes over the"
        " supplied corpus of %i documents, updating every %i documents,"
        " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f",
        updatetype, self.num_topics, self.passes, lencorpus, updateafter,
        evalafter, self.iterations, self.gamma_threshold)

    if updates_per_pass * self.passes < 10:
        logger.warning(
            "too few updates, training might not converge; consider "
            "increasing the number of passes or iterations to improve accuracy")

    job_queue = Queue(maxsize=2 * self.workers)
    result_queue = Queue()

    # rho is the "speed" of updating; TODO try other fncs
    # pass_ + num_updates handles increasing the starting t for each pass,
    # while allowing it to "reset" on the first pass of each update
    def rho():
        return pow(self.offset + pass_ + (self.num_updates / self.chunksize), -self.decay)

    logger.info("training LDA model using %i processes", self.workers)
    pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,))
    for pass_ in xrange(self.passes):
        queue_size, reallen = [0], 0
        other = LdaState(self.eta, self.state.sstats.shape)

        def process_result_queue(force=False):
            """
            Clear the result queue, merging all intermediate results, and update the
            LDA model if necessary.
            """
            merged_new = False
            while not result_queue.empty():
                other.merge(result_queue.get())
                queue_size[0] -= 1
                merged_new = True
            if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)):
                self.do_mstep(rho(), other, pass_ > 0)
                other.reset()
                if self.eval_every is not None and \
                        ((force and queue_size[0] == 0) or
                         (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)):
                    self.log_perplexity(chunk, total_docs=lencorpus)

        chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy)
        for chunk_no, chunk in enumerate(chunk_stream):
            reallen += len(chunk)  # keep track of how many documents we've processed so far

            # put the chunk into the workers' input job queue
            chunk_put = False
            while not chunk_put:
                try:
                    job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1)
                    chunk_put = True
                    queue_size[0] += 1
                    logger.info(
                        'PROGRESS: pass %i, dispatched chunk #%i = '
                        'documents up to #%i/%i, outstanding queue size %i',
                        pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0])
                except queue.Full:
                    # in case the input job queue is full, keep clearing the
                    # result queue, to make sure we don't deadlock
                    process_result_queue()

            process_result_queue()
        # endfor single corpus pass

        # wait for all outstanding jobs to finish
        while queue_size[0] > 0:
            process_result_queue(force=True)

        if reallen != lencorpus:
            raise RuntimeError("input corpus size changed during training (don't use generators as input)")
    # endfor entire update

    pool.terminate()
def train(self, input_file=None, total_words=None, word_count=0, chunksize=100, alpha=0.025, alpha_doc=0.025,
          sentences_length=None):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    if FAST_VERSION < 0:
        import warnings
        warnings.warn(
            "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`"
        )
    logger.info(
        "training model with %i workers on %i vocabulary and %i features, "
        "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" %
        (self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative))

    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    # set the learning rates
    self.alpha_doc = float(alpha_doc)
    self.alpha = float(alpha)

    start, next_report = time.time(), [1.0]
    word_count = [word_count]
    total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab)))
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        work = zeros(self.syn1_size, dtype=REAL)  # each thread must have its own work memory
        # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
        neu1 = zeros(self.syn1_size, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words))
            if self.alpha_flag == 1:
                alpha = self.alpha
            # print "alpha", alpha

            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            if self.sg:
                if self.skip_gram_type == 0:
                    # job_id = 0
                    # sentence_id_, _ = job[job_id]
                    # print "py sentence_id = ", sentence_id_
                    # bf = deepcopy(self.doc[sentence_id_][0])
                    # print "bf : ", bf
                    job_words = sum(
                        train_sentence_sg_simple(self, sentence_id, sentence, alpha, work, self.alpha_doc)
                        for sentence_id, sentence in job)
                    # print "af : ", self.doc[sentence_id_][0]
                    # print "re : ", self.doc[sentence_id_][0] - bf
                elif self.skip_gram_type == 1:
                    # ids_back = [sentence_id for sentence_id, _ in job]
                    # bf_ = deepcopy(self.doc[ids_back])
                    job_words = sum(
                        train_sentence_sg_average(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                        for sentence_id, sentence in job)
                    # af_ = self.doc[ids_back]
                    # print numpy.mean(af_ - bf_)
                elif self.skip_gram_type == 2:
                    ids_back = [sentence_id for sentence_id, _ in job]
                    bf_ = deepcopy(self.doc[ids_back])
                    job_words = sum(
                        train_sentence_sg_concat(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                        for sentence_id, sentence in job)
                    af_ = self.doc[ids_back]
                    print numpy.mean(af_ - bf_)
            elif self.cbow_type == 4:
                ids_back = [sentence_id for sentence_id, _ in job]
                bf_ = deepcopy(self.doc[ids_back])
                # job_words = sum(train_sentence_cbow_average_plus_doc_vec_extra_train(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc) for sentence_id, sentence in job)
                job_words = sum(
                    train_sentence_cbow_average_plus_doc(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                    for sentence_id, sentence in job)
                af_ = self.doc[ids_back]
                print numpy.mean(af_ - bf_)
                # print "re : ", af_ - bf_
            # elif self.cbow_type == 5:
            #     job_words = sum(train_sentence_cbow_concatenate_v2(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc) for sentence_id, sentence in job)
            elif self.cbow_type == 3:
                job_id = 0
                ids_back = [sentence_id for sentence_id, _ in job]
                bf_ = deepcopy(self.doc[ids_back])
                sentence_id_, sentence_ = job[job_id]
                # print "py sentence_id = ", sentence_id_
                # bf = deepcopy(self.doc[sentence_id_])
                # print "bf : ", bf
                # print "null_vec", self.null_vec
                job_words = sum(
                    train_sentence_cbow_concatenate(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                    for sentence_id, sentence in job)
                # af_ = self.doc[ids_back]
                # print numpy.mean(af_ - bf_)
                # print "af : ", self.doc[sentence_id_]
                # print "re : ", self.doc[sentence_id_] - bf
                # print sum(self.doc[sentence_id_] - bf)
            elif self.cbow_type == 2:
                job_words = sum(
                    train_sentence_cbow_concatenate_syn1_doc(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                    for sentence_id, sentence in job)
            elif self.cbow_type == 1:
                ids_back = [sentence_id for sentence_id, _ in job]
                bf_ = deepcopy(self.doc[ids_back])
                job_words = sum(
                    train_sentence_cbow_average_simple(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                    for sentence_id, sentence in job)
                af_ = self.doc[ids_back]
                print numpy.mean(af_ - bf_)
                # print af_ - bf_
            elif self.cbow_type == 0:
                job_words = sum(
                    train_sentence_cbow_syn1_doc(self, sentence_id, sentence, alpha, work, neu1, self.alpha_doc)
                    for sentence_id, sentence in job)

            with lock:
                word_count[0] += job_words
                elapsed = time.time() - start
                if elapsed >= next_report[0]:
                    logger.info(
                        "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                        (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    def prepare_sentences():
        """Shuffle the sentence indices here and feed the sentences in random order."""
        if self.random_learn_flag:
            # shuffle the input data randomly before training
            indexes_sentence_ids = numpy.array(range(sentences_length))
            random.shuffle(indexes_sentence_ids, lambda: random_seed)
            sentences = [(indexes_sentence_ids[index], sentence)
                         for index, sentence in enumerate(open(input_file))]
        else:
            sentences = enumerate(open(input_file))
        for sentence_id, sentence in sentences:
            sentence = sentence.split(u" ")
            # skip sentences that were already trained on (when resuming from a partially trained model)
            if sentence_id < self.skip_id:
                print "skip! :" + str(sentence_id) + " " + str(self.skip_id)
                continue
            sampled = [
                self.vocab[word] for word in sentence
                if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or
                                           self.vocab[word].sample_probability >= numpy.random.random_sample())
            ]
            yield (sentence_id, sampled)
        # no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences)

    # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue
    for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0))

    return word_count[0]
def add_documents(self, corpus, chunksize=None, decay=None):
    """Update model with new `corpus`.

    Parameters
    ----------
    corpus : {iterable of list of (int, float), scipy.sparse.csc}
        Stream of document vectors or sparse matrix of shape (`num_terms`, num_documents).
    chunksize : int, optional
        Number of documents to be used in each training chunk, will use `self.chunksize` if not specified.
    decay : float, optional
        Weight of existing observations relatively to new ones, will use `self.decay` if not specified.

    Notes
    -----
    Training proceeds in chunks of `chunksize` documents at a time. The size of `chunksize` is a tradeoff
    between increased speed (bigger `chunksize`) vs. lower memory footprint (smaller `chunksize`).
    If the distributed mode is on, each chunk is sent to a different worker/computer.

    """
    logger.info("updating model with new documents")

    # get computation parameters; if not specified, use the ones from constructor
    if chunksize is None:
        chunksize = self.chunksize
    if decay is None:
        decay = self.decay

    if not scipy.sparse.issparse(corpus):
        if not self.onepass:
            # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
            update = Projection(self.num_terms, self.num_topics, None, dtype=self.dtype)
            update.u, update.s = stochastic_svd(
                corpus, self.num_topics,
                num_terms=self.num_terms, chunksize=chunksize,
                extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype)
            self.projection.merge(update, decay=decay)
            self.docs_processed += len(corpus) if hasattr(corpus, '__len__') else 0
        else:
            # the one-pass algo
            doc_no = 0
            if self.dispatcher:
                logger.info('initializing %s workers', self.numworkers)
                self.dispatcher.reset()
            for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)):
                logger.info("preparing a new chunk of documents")
                nnz = sum(len(doc) for doc in chunk)
                # construct the job as a sparse matrix, to minimize memory overhead
                # definitely avoid materializing it as a dense matrix!
                logger.debug("converting corpus to csc format")
                job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz, dtype=self.dtype)
                del chunk
                doc_no += job.shape[1]
                if self.dispatcher:
                    # distributed version: add this job to the job queue, so workers can work on it
                    logger.debug("creating job #%i", chunk_no)
                    # put job into queue; this will eventually block, because the queue has a small finite size
                    self.dispatcher.putjob(job)
                    del job
                    logger.info("dispatched documents up to #%s", doc_no)
                else:
                    # serial version, there is only one "worker" (myself) => process the job directly
                    update = Projection(
                        self.num_terms, self.num_topics, job,
                        extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype)
                    del job
                    self.projection.merge(update, decay=decay)
                    del update
                    logger.info("processed documents up to #%s", doc_no)
                    self.print_topics(5)

            # wait for all workers to finish (distributed version only)
            if self.dispatcher:
                logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                self.projection = self.dispatcher.getstate()
            self.docs_processed += doc_no
    else:
        assert not self.dispatcher, "must be in serial mode to receive jobs"
        update = Projection(
            self.num_terms, self.num_topics, corpus.tocsc(),
            extra_dims=self.extra_samples, power_iters=self.power_iters, dtype=self.dtype)
        self.projection.merge(update, decay=decay)
        logger.info("processed sparse job of %i documents", corpus.shape[1])
        self.docs_processed += corpus.shape[1]
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, power_iters=0, dtype=np.float64, eps=1e-6): """Run truncated Singular Value Decomposition (SVD) on a sparse input. Parameters ---------- corpus : {iterable of list of (int, float), scipy.sparse} Input corpus as a stream (does not have to fit in RAM) or a sparse matrix of shape (`num_terms`, num_documents). rank : int Desired number of factors to be retained after decomposition. num_terms : int The number of features (terms) in `corpus`. chunksize : int, optional Number of documents to be used in each training chunk. extra_dims : int, optional Extra samples to be used besides the rank `k`. Can improve accuracy. power_iters: int, optional Number of power iteration steps to be used. Increasing the number of power iterations improves accuracy, but lowers performance. dtype : numpy.dtype, optional Enforces a type for elements of the decomposed matrix. eps: float, optional Percentage of the spectrum's energy to be discarded. Notes ----- The corpus may be larger than RAM (iterator of vectors), if `corpus` is a `scipy.sparse.csc` instead, it is assumed the whole corpus fits into core memory and a different (more efficient) code path is chosen. This may return less than the requested number of top `rank` factors, in case the input itself is of lower rank. The `extra_dims` (oversampling) and especially `power_iters` (power iterations) parameters affect accuracy of the decomposition. This algorithm uses `2 + power_iters` passes over the input data. In case you can only afford a single pass, set `onepass=True` in :class:`~gensim.models.lsimodel.LsiModel` and avoid using this function directly. The decomposition algorithm is based on `"Finding structure with randomness: Probabilistic algorithms for constructing approximate matrix decompositions" <https://arxiv.org/abs/0909.4061>`_. Returns ------- (np.ndarray 2D, np.ndarray 1D) The left singular vectors and the singular values of the `corpus`. """ rank = int(rank) if extra_dims is None: samples = max( 10, 2 * rank ) # use more samples than requested factors, to improve accuracy else: samples = rank + int(extra_dims) logger.info("using %i extra samples and %i power iterations", samples - rank, power_iters) num_terms = int(num_terms) # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O) # build Y in blocks of `chunksize` documents (much faster than going one-by-one # and more memory friendly than processing all documents at once) y = np.zeros(dtype=dtype, shape=(num_terms, samples)) logger.info("1st phase: constructing %s action matrix", str(y.shape)) if scipy.sparse.issparse(corpus): m, n = corpus.shape assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % ( m, num_terms) o = np.random.normal(0.0, 1.0, (n, samples)).astype( y.dtype) # draw a random gaussian matrix sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, corpus.data, o.ravel(), y.ravel()) # y = corpus * o del o # unlike np, scipy.sparse `astype()` copies everything, even if there is no change to dtype! 
# so check for equal dtype explicitly, to avoid the extra memory footprint if possible if y.dtype != dtype: y = y.astype(dtype) logger.info("orthonormalizing %s action matrix", str(y.shape)) y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range logger.debug("running %i power iterations", power_iters) for _ in range(power_iters): q = corpus.T * q q = [corpus * q] q, _ = matutils.qr_destroy( q) # orthonormalize the range after each power iteration step else: num_docs = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i', (chunk_no * chunksize)) # construct the chunk as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense (num_terms x chunksize) matrix! s = sum(len(doc) for doc in chunk) chunk = matutils.corpus2csc( chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC m, n = chunk.shape assert m == num_terms assert n <= chunksize # the very last chunk of A is allowed to be smaller in size num_docs += n logger.debug("multiplying chunk * gauss") o = np.random.normal(0.0, 1.0, (n, samples)).astype( dtype) # draw a random gaussian matrix sparsetools.csc_matvecs( m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o chunk.data, o.ravel(), y.ravel()) del chunk, o y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range for power_iter in range(power_iters): logger.info("running power iteration #%i", power_iter + 1) yold = q.copy() q[:] = 0.0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs) # documents = columns of sparse CSC chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) tmp = chunk.T * yold tmp = chunk * tmp del chunk q += tmp del yold q = [q] q, _ = matutils.qr_destroy(q) # orthonormalize the range qt = q[:, :samples].T.copy() del q if scipy.sparse.issparse(corpus): b = qt * corpus logger.info("2nd phase: running dense svd on %s matrix", str(b.shape)) u, s, vt = scipy.linalg.svd(b, full_matrices=False) del b, vt else: # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A # again, construct X incrementally, in chunks of `chunksize` documents from the streaming # input corpus A, to avoid using O(number of documents) memory x = np.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=dtype) logger.info("2nd phase: constructing %s covariance matrix", str(x.shape)) for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i', chunk_no * chunksize, num_docs) chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype) b = qt * chunk # dense * sparse matrix multiply del chunk x += np.dot( b, b.T ) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :( del b # now we're ready to compute decomposition of the small matrix X logger.info("running dense decomposition on %s covariance matrix", str(x.shape)) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :) u, s, vt = scipy.linalg.svd(x) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus s = np.sqrt(s) q = qt.T.copy() del qt logger.info("computing the final decomposition") keep = clip_spectrum(s**2, rank, discard=eps) u = u[:, :keep].copy() s = s[:keep] u = np.dot(q, u) return u.astype(dtype), s.astype(dtype)
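# Hedged sketch of the two-phase randomized SVD idea implemented by stochastic_svd() above:
# (1) find an orthonormal basis Q for the range of A via a random Gaussian projection,
# (2) project A onto that basis and run a small dense SVD. This toy version works on a
# dense in-memory matrix only; it omits chunked streaming, power iterations and
# clip_spectrum, so it illustrates the algorithm rather than reproducing the library code.
import numpy as np

def toy_randomized_svd(A, rank, oversample=10, seed=0):
    rng = np.random.default_rng(seed)
    samples = rank + oversample
    Y = A @ rng.standard_normal((A.shape[1], samples))   # action matrix Y = A * O
    Q, _ = np.linalg.qr(Y)                                # orthonormal basis for the range of A
    B = Q.T @ A                                           # small (samples x n) projected matrix
    Ub, s, Vt = np.linalg.svd(B, full_matrices=False)     # dense SVD of the small matrix
    U = Q @ Ub                                            # lift factors back to the original space
    return U[:, :rank], s[:rank]

A = np.random.default_rng(42).random((200, 50))
U, s = toy_randomized_svd(A, rank=5)
print(s)   # roughly approximates the leading singular values of A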
def update(self, corpus, chunks_as_numpy=False): """Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). Train the model with new documents, by EM-iterating over the corpus until the topics converge, or until the maximum number of allowed iterations is reached. `corpus` must be an iterable. The E step is distributed into the several processes. Notes ----- This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0>. Parameters ---------- corpus : {iterable of list of (int, float), scipy.sparse.csc}, optional Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`) used to update the model. chunks_as_numpy : bool Whether each chunk passed to the inference step should be a np.ndarray or not. Numpy can in some settings turn the term IDs into floats, these will be converted back into integers in inference, which incurs a performance hit. For distributed computing it may be desirable to keep the chunks as `numpy.ndarray`. """ try: lencorpus = len(corpus) except TypeError: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaMulticore.update() called with an empty corpus") return self.state.numdocs += lencorpus if not self.batch: updatetype = "online" updateafter = self.chunksize * self.workers else: updatetype = "batch" updateafter = lencorpus evalafter = min(lencorpus, (self.eval_every or 0) * updateafter) updates_per_pass = max(1, lencorpus / updateafter) logger.info( "running %s LDA training, %s topics, %i passes over the supplied corpus of %i documents, " "updating every %i documents, evaluating every ~%i documents, " "iterating %ix with a convergence threshold of %f", updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, self.iterations, self.gamma_threshold ) if updates_per_pass * self.passes < 10: logger.warning( "too few updates, training might not converge; " "consider increasing the number of passes or iterations to improve accuracy" ) job_queue = Queue(maxsize=2 * self.workers) result_queue = Queue() # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, # while allowing it to "reset" on the first pass of each update def rho(): return pow(self.offset + pass_ + (self.num_updates / self.chunksize), -self.decay) logger.info("training LDA model using %i processes", self.workers) pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,)) for pass_ in xrange(self.passes): queue_size, reallen = [0], 0 other = LdaState(self.eta, self.state.sstats.shape) def process_result_queue(force=False): """ Clear the result queue, merging all intermediate results, and update the LDA model if necessary. 
""" merged_new = False while not result_queue.empty(): other.merge(result_queue.get()) queue_size[0] -= 1 merged_new = True if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)): self.do_mstep(rho(), other, pass_ > 0) other.reset() if self.eval_every is not None and \ ((force and queue_size[0] == 0) or (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)): self.log_perplexity(chunk, total_docs=lencorpus) chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=chunks_as_numpy) for chunk_no, chunk in enumerate(chunk_stream): reallen += len(chunk) # keep track of how many documents we've processed so far # put the chunk into the workers' input job queue chunk_put = False while not chunk_put: try: job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1) chunk_put = True queue_size[0] += 1 logger.info( "PROGRESS: pass %i, dispatched chunk #%i = documents up to #%i/%i, " "outstanding queue size %i", pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0] ) except queue.Full: # in case the input job queue is full, keep clearing the # result queue, to make sure we don't deadlock process_result_queue() process_result_queue() # endfor single corpus pass # wait for all outstanding jobs to finish while queue_size[0] > 0: process_result_queue(force=True) if reallen != lencorpus: raise RuntimeError("input corpus size changed during training (don't use generators as input)") # endfor entire update pool.terminate()
def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable (repeatable stream of documents), In distributed mode, the E step is distributed over a cluster of machines. This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0>. """ # use parameters given in constructor, unless user explicitly overrode them if chunksize is None: chunksize = self.chunksize if decay is None: decay = self.decay if passes is None: passes = self.passes if update_every is None: update_every = self.update_every if eval_every is None: eval_every = self.eval_every if iterations is None: iterations = self.iterations if gamma_threshold is None: gamma_threshold = self.gamma_threshold # rho is the "speed" of updating; TODO try other fncs rho = lambda: pow(1.0 + self.num_updates, -decay) try: lencorpus = len(corpus) except: logger.warning( "input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaModel.update() called with an empty corpus") return self.state.numdocs += lencorpus if update_every: updatetype = "online" updateafter = min(lencorpus, update_every * self.numworkers * chunksize) else: updatetype = "batch" updateafter = lencorpus evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) updates_per_pass = max(1, lencorpus / updateafter) logger.info( "running %s LDA training, %s topics, %i passes over " "the supplied corpus of %i documents, updating model once " "every %i documents, evaluating perplexity every %i documents, " "iterating %ix with a convergence threshold of %f" % (updatetype, self.num_topics, passes, lencorpus, updateafter, evalafter, iterations, gamma_threshold)) if updates_per_pass * passes < 10: logger.warning( "too few updates, training might not converge; consider " "increasing the number of passes or iterations to improve accuracy" ) for pass_ in xrange(passes): if self.dispatcher: logger.info('initializing %s workers' % self.numworkers) self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) dirty = False reallen = 0 for chunk_no, chunk in enumerate( utils.grouper(corpus, chunksize, as_numpy=True)): reallen += len( chunk ) # keep track of how many documents we've processed so far if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): self.log_perplexity(chunk, total_docs=lencorpus) if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it logger.info( 'PROGRESS: pass %i, dispatching documents up to #%i/%i' % (pass_, chunk_no * chunksize + len(chunk), lencorpus)) # this will eventually block until some jobs finish, because the queue has a small finite length self.dispatcher.putjob(chunk) else: logger.info( 'PROGRESS: pass %i, at document #%i/%i' % (pass_, chunk_no * chunksize + len(chunk), lencorpus)) gammat = self.do_estep(chunk, other) if 
self.optimize_alpha: self.update_alpha(gammat, rho) dirty = True del chunk # perform an M step. determine when based on update_every, don't do this after every chunk if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: if self.dispatcher: # distributed mode: wait for all workers to finish logger.info( "reached the end of input; now waiting for all remaining jobs to finish" ) other = self.dispatcher.getstate() self.do_mstep(rho(), other) del other # free up some mem if self.dispatcher: logger.info('initializing workers') self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) dirty = False #endfor single corpus iteration if reallen != lencorpus: raise RuntimeError( "input corpus size changed during training (don't use generators as input)" ) if dirty: # finish any remaining updates if self.dispatcher: # distributed mode: wait for all workers to finish logger.info( "reached the end of input; now waiting for all remaining jobs to finish" ) other = self.dispatcher.getstate() self.do_mstep(rho(), other) del other dirty = False
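# Hedged sketch of the bookkeeping arithmetic used by update() above to decide how often the
# M-step and the perplexity evaluation run. The parameter values below are made up for
# illustration; only the formulas mirror the code.
lencorpus = 100000      # documents in the corpus
chunksize = 2000
numworkers = 4
update_every = 1        # M-step after every worker round in online mode
eval_every = 10         # evaluate perplexity roughly every 10 updates
passes = 1

updateafter = min(lencorpus, update_every * numworkers * chunksize)     # docs per M-step
evalafter = min(lencorpus, (eval_every or 0) * numworkers * chunksize)  # docs per evaluation
updates_per_pass = max(1, lencorpus // updateafter)

print(updateafter, evalafter, updates_per_pass)   # 8000 80000 12
if updates_per_pass * passes < 10:
    print("too few updates, training might not converge")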
def update(self, corpus, chunksize=None, passes=None, eval_every=None): """Train the model with new documents. Parameters ---------- corpus : iterable of list of (int, float) or `csc_matrix` with the shape (n_tokens, n_documents) Training corpus. Can be either iterable of documents, which are lists of `(word_id, word_count)`, or a sparse csc matrix of BOWs for each document. If not specified, the model is left uninitialized (presumably, to be trained later with `self.train()`). chunksize: int, optional Number of documents to be used in each training chunk. passes: int, optional Number of full passes over the training corpus. Leave at default `passes=1` if your input is an iterator. eval_every: int, optional Number of batches after which l2 norm of (v - Wh) is computed. Decreases performance if set too low. """ # use parameters given in constructor, unless user explicitly overrode them if passes is None: passes = self.passes if eval_every is None: eval_every = self.eval_every lencorpus = np.inf if isinstance(corpus, scipy.sparse.csc.csc_matrix): lencorpus = corpus.shape[1] else: try: lencorpus = len(corpus) except TypeError: logger.info("input corpus stream has no len()") if chunksize is None: chunksize = min(lencorpus, self.chunksize) evalafter = min(lencorpus, (eval_every or 0) * chunksize) if lencorpus == 0: logger.warning("Nmf.update() called with an empty corpus") return if isinstance(corpus, collections.Iterator) and self.passes > 1: raise ValueError( "Corpus is an iterator, only `passes=1` is valid.") logger.info( "running NMF training, %s topics, %i passes over the supplied corpus of %s documents, evaluating l2 norm " "every %i documents", self.num_topics, passes, lencorpus, evalafter, ) chunk_overall_idx = 1 for pass_ in range(passes): if isinstance(corpus, scipy.sparse.csc.csc_matrix): grouper = ( # Older scipy (0.19 etc) throw an error when slicing beyond the actual sparse array dimensions, so # we clip manually with min() here. corpus[:, col_idx:min(corpus.shape[1], col_idx + self.chunksize)] for col_idx in range(0, corpus.shape[1], self.chunksize)) else: grouper = utils.grouper(corpus, self.chunksize) for chunk_idx, chunk in enumerate(grouper): if isinstance(corpus, scipy.sparse.csc.csc_matrix): v = chunk[:, self.random_state.permutation(chunk.shape[1])] chunk_len = v.shape[1] else: self.random_state.shuffle(chunk) v = matutils.corpus2csc( chunk, num_terms=self.num_tokens, ) chunk_len = len(chunk) logger.info("PROGRESS: pass %i, at document #%i/%s", pass_, chunk_idx * chunksize + chunk_len, lencorpus) if self._W is None: # If `self._W` is not set (i.e. the first batch being handled), compute the initial matrix using the # batch mean. self._setup(v) self._h = self._solveproj(v, self._W, h=self._h, v_max=self.v_max) h = self._h if eval_every and (((chunk_idx + 1) * chunksize >= lencorpus) or (chunk_idx + 1) % eval_every == 0): logger.info("L2 norm: {}".format(self.l2_norm(v))) self.print_topics(5) self.A *= chunk_overall_idx - 1 self.A += h.dot(h.T) self.A /= chunk_overall_idx self.B *= chunk_overall_idx - 1 self.B += v.dot(h.T) self.B /= chunk_overall_idx previous_w_error = self._w_error self._solve_w() chunk_overall_idx += 1 logger.info("W error diff: {}".format( (self._w_error - previous_w_error)))
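# Hedged sketch of the running-average accumulators A and B maintained by Nmf.update() above:
# after k chunks, A is the mean of h.dot(h.T) and B the mean of v.dot(h.T) over all chunks
# seen so far. The toy shapes and random data are made up; only the update rule mirrors the
# code.
import numpy as np

num_topics, num_tokens, chunk_len = 3, 5, 4
A = np.zeros((num_topics, num_topics))
B = np.zeros((num_tokens, num_topics))

rng = np.random.default_rng(0)
for chunk_overall_idx in range(1, 6):          # five chunks
    v = rng.random((num_tokens, chunk_len))    # term-document chunk
    h = rng.random((num_topics, chunk_len))    # topic-document factors for the chunk
    A = (A * (chunk_overall_idx - 1) + h.dot(h.T)) / chunk_overall_idx
    B = (B * (chunk_overall_idx - 1) + v.dot(h.T)) / chunk_overall_idx

print(A.shape, B.shape)   # (3, 3) (5, 3)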
def train(self, sentences, total_words=None, word_count=0, chunksize=100, total_examples=None, queue_factor=2, report_delay=1): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For FastSent, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the sentences are the same as those that were used to initially build the vocabulary. """ if FAST_VERSION < 0: import warnings warnings.warn("C extension not loaded for FastSent, training will be slow. " "Install a C compiler and reinstall gensim for fast training.") self.neg_labels = [] logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sample=%s", self.workers, len(self.vocab), self.layer1_size, self.sample) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") if not hasattr(self, 'syn0'): raise RuntimeError("you must first finalize vocabulary before training the model") if total_words is None and total_examples is None: if self.corpus_count: total_examples = self.corpus_count logger.info("expecting %i examples, matching count from corpus used for vocabulary survey", total_examples) else: raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations") if self.iter > 1: sentences = utils.RepeatCorpusNTimes(sentences, self.iter) total_words = total_words and total_words * self.iter total_examples = total_examples and total_examples * self.iter def worker_init(): work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) return (work, neu1) def worker_one_job(job, inits): items, alpha = job if items is None: # signal to finish return False # train & return tally tally, raw_tally = self._do_train_job(items, alpha, inits) progress_queue.put((len(items), tally, raw_tally)) # report progress return True # loop of a given worker: fetches the data from the queue and then # launches the worker_one_job function def worker_loop(): """Train the model, lifting lists of sentences from the jobs queue.""" init = worker_init() while True: job = job_queue.get() if not worker_one_job(job, init): break start, next_report = default_timer(), 1.0 # buffer ahead only a limited number of jobs.. 
this is the reason we can't simply use ThreadPool :( if self.workers > 0: job_queue = Queue(maxsize=queue_factor * self.workers) else: job_queue = FakeJobQueue(worker_init, worker_one_job) progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) workers = [threading.Thread(target=worker_loop) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() pushed_words = 0 pushed_examples = 0 example_count = 0 trained_word_count = 0 raw_word_count = word_count push_done = False done_jobs = 0 next_alpha = self.alpha jobs_source = enumerate(utils.grouper(sentences, chunksize)) # fill jobs queue with (sentence, alpha) job tuples while True: try: job_no, items = next(jobs_source) logger.debug("putting job #%i in the queue at alpha %.05f", job_no, next_alpha) job_queue.put((items, next_alpha)) # update the learning rate before every next job if self.min_alpha < next_alpha: if total_examples: # examples-based decay pushed_examples += len(items) next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_examples / total_examples) else: # words-based decay pushed_words += self._raw_word_count(items) next_alpha = self.alpha - (self.alpha - self.min_alpha) * (pushed_words / total_words) next_alpha = max(next_alpha, self.min_alpha) except StopIteration: logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) for _ in xrange(self.workers): job_queue.put((None, 0)) # give the workers heads up that they can finish -- no more work! push_done = True try: while done_jobs < (job_no+1) or not push_done: examples, trained_words, raw_words = progress_queue.get(push_done) # only block after all jobs pushed example_count += examples trained_word_count += trained_words # only words in vocab & sampled raw_word_count += raw_words done_jobs += 1 elapsed = default_timer() - start if elapsed >= next_report: if total_examples: # examples-based progress % logger.info( "FASTSENT MODEL PROGRESS: at %.2f%% examples, %.0f words/s", 100.0 * example_count / total_examples, trained_word_count / elapsed) else: # words-based progress % logger.info( "FASTSENT MODEL PROGRESS: at %.2f%% words, %.0f words/s", 100.0 * raw_word_count / total_words, trained_word_count / elapsed) next_report = elapsed + report_delay # don't flood log, wait report_delay seconds else: # loop ended by job count; really done break except Empty: pass # already out of loop; continue to next push elapsed = default_timer() - start logger.info( "training on %i raw words took %.1fs, %.0f trained words/s", raw_word_count, elapsed, trained_word_count / elapsed if elapsed else 0.0) if total_examples and total_examples != example_count: logger.warn("supplied example count (%i) did not equal expected count (%i)", example_count, total_examples) if total_words and total_words != raw_word_count: logger.warn("supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words) self.train_count += 1 # number of times train() has been called self.total_train_time += elapsed self.clear_sims() return trained_word_count
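# Hedged sketch of the linear learning-rate decay used in train() above: alpha shrinks from
# its initial value towards min_alpha in proportion to how much of the expected input has
# already been pushed to workers. The values below are illustrative only.
alpha, min_alpha = 0.025, 0.0001
total_examples = 10000

pushed_examples = 0
for job_len in [2500, 2500, 2500, 2500]:          # four equally sized jobs
    pushed_examples += job_len
    next_alpha = alpha - (alpha - min_alpha) * (pushed_examples / total_examples)
    next_alpha = max(next_alpha, min_alpha)
    print("progress %.0f%% -> next alpha %.5f"
          % (100.0 * pushed_examples / total_examples, next_alpha))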
def train(self, sentences, total_words=None, word_count=0, chunksize=100): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ if FAST_VERSION < 0: import warnings warnings.warn( "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`" ) logger.info( "training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size)) if not self.vocab: raise RuntimeError( "you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] word_count, total_words = [ word_count ], total_words or sum(v.count for v in itervalues(self.vocab)) jobs = Queue( maxsize=2 * self.workers ) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock( ) # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = zeros( self.layer1_size, dtype=REAL) # each thread must have its own work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max( self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) # how many words did we train on? out-of-vocabulary (unknown) words do not count if self.sg: job_words = sum( train_sentence_sg(self, sentence, alpha, work) for sentence in job) else: job_words = sum( train_sentence_cbow(self, sentence, alpha, work, neu1) for sentence in job) with lock: word_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.info( "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[ 0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [ threading.Thread(target=worker_train) for _ in xrange(self.workers) ] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) for job_no, job in enumerate(utils.grouper(no_oov, chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info( "reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put( None ) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i words took %.1fs, %.0f words/s" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)) return word_count[0]
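# Hedged sketch of the producer/consumer pattern used by train() above: a bounded job queue,
# daemon worker threads, and one None sentinel per worker to signal "no more work". The
# "training" here is a stand-in sum, purely for illustration.
import threading
from queue import Queue

num_workers = 2
jobs = Queue(maxsize=2 * num_workers)     # bounded: the producer blocks instead of buffering everything
results = []
lock = threading.Lock()                   # protects the shared results list

def worker():
    while True:
        job = jobs.get()
        if job is None:                   # sentinel: data finished, exit
            break
        processed = sum(job)              # stand-in for the real per-chunk training
        with lock:
            results.append(processed)

threads = [threading.Thread(target=worker, daemon=True) for _ in range(num_workers)]
for t in threads:
    t.start()

for chunk in ([1, 2], [3, 4], [5, 6]):    # producer: push chunks of work
    jobs.put(chunk)
for _ in range(num_workers):              # one sentinel per worker
    jobs.put(None)
for t in threads:
    t.join()
print(sorted(results))                    # [3, 7, 11]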
def update(self, corpus, chunksize=None, decay=None, passes=None, update_every=None): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). In distributed mode, the E step is distributed over a cluster of machines. This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0>. """ # use parameters given in constructor, unless user explicitly overrode them if chunksize is None: chunksize = self.chunksize if decay is None: decay = self.decay if passes is None: passes = self.passes if update_every is None: update_every = self.update_every # rho is the "speed" of updating; TODO try other fncs rho = lambda: pow(1.0 + self.num_updates, -decay) try: lencorpus = len(corpus) except: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaModel.update() called with an empty corpus") return self.state.numdocs += lencorpus if update_every > 0: updatetype = "online" updateafter = min(lencorpus, update_every * self.numworkers * chunksize) else: updatetype = "batch" updateafter = lencorpus updates_per_pass = max(1, lencorpus / updateafter) logger.info("running %s LDA training, %s topics, %i passes over " "the supplied corpus of %i documents, updating model once " "every %i documents" % (updatetype, self.num_topics, passes, lencorpus, updateafter)) if updates_per_pass * passes < 10: logger.warning("too few updates, training might not converge; consider " "increasing the number of passes to improve accuracy") for iteration in xrange(passes): if self.dispatcher: logger.info('initializing %s workers' % self.numworkers) self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) dirty = False for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)): if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' % (iteration, chunk_no * chunksize + len(chunk), lencorpus)) # this will eventually block until some jobs finish, because the queue has a small finite length self.dispatcher.putjob(chunk) else: logger.info('PROGRESS: iteration %i, at document #%i/%i' % (iteration, chunk_no * chunksize + len(chunk), lencorpus)) self.do_estep(chunk, other) dirty = True del chunk # perform an M step. 
determine when based on update_every, don't do this after every chunk if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: if self.dispatcher: # distributed mode: wait for all workers to finish logger.info("reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other) del other # free up some mem if self.dispatcher: logger.info('initializing workers') self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) dirty = False #endfor single corpus iteration if dirty: # finish any remaining updates if self.dispatcher: # distributed mode: wait for all workers to finish logger.info("reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other) del other dirty = False
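# Hedged sketch of the "dirty flag" control flow used by update() above: accumulate E-step
# results, run an M-step every `update_every` chunks, and flush any remaining accumulated
# state once the pass ends. The accumulator here is a plain counter standing in for the real
# LdaState, purely for illustration.
update_every = 3
accumulated, dirty = 0, False

for chunk_no, chunk_len in enumerate([10, 10, 10, 10, 10, 10, 10]):   # seven toy chunks
    accumulated += chunk_len
    dirty = True
    if (chunk_no + 1) % update_every == 0:
        print("M-step over %i documents" % accumulated)
        accumulated, dirty = 0, False

if dirty:                        # finish any remaining updates after the pass
    print("final M-step over %i documents" % accumulated)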
def update(self, corpus, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable (repeatable stream of documents), In distributed mode, the E step is distributed over a cluster of machines. This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0>. Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see Table 1 in Hoffman et al.) """ # use parameters given in constructor, unless user explicitly overrode them if chunksize is None: chunksize = self.chunksize if decay is None: decay = self.decay if offset is None: offset = self.offset if passes is None: passes = self.passes if update_every is None: update_every = self.update_every if eval_every is None: eval_every = self.eval_every if iterations is None: iterations = self.iterations if gamma_threshold is None: gamma_threshold = self.gamma_threshold # rho is the "speed" of updating; TODO try other fncs rho = lambda: pow(offset + self.num_updates / self.chunksize, -decay) try: lencorpus = len(corpus) except: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaModel.update() called with an empty corpus") return self.state.numdocs += lencorpus if update_every: updatetype = "online" updateafter = min(lencorpus, update_every * self.numworkers * chunksize) else: updatetype = "batch" updateafter = lencorpus evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) updates_per_pass = max(1, lencorpus / updateafter) logger.info("running %s LDA training, %s topics, %i passes over " "the supplied corpus of %i documents, updating model once " "every %i documents, evaluating perplexity every %i documents, " "iterating %ix with a convergence threshold of %f" % (updatetype, self.num_topics, passes, lencorpus, updateafter, evalafter, iterations, gamma_threshold)) if updates_per_pass * passes < 10: logger.warning("too few updates, training might not converge; consider " "increasing the number of passes or iterations to improve accuracy") for pass_ in xrange(passes): if self.dispatcher: logger.info('initializing %s workers' % self.numworkers) self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) dirty = False reallen = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize, as_numpy=True)): reallen += len(chunk) # keep track of how many documents we've processed so far if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): self.log_perplexity(chunk, total_docs=lencorpus) if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it logger.info('PROGRESS: pass %i, dispatching documents up to #%i/%i' % (pass_, chunk_no * chunksize + len(chunk), lencorpus)) # this will eventually block until some jobs finish, because the queue has a small finite length 
self.dispatcher.putjob(chunk) else: logger.info('PROGRESS: pass %i, at document #%i/%i' % (pass_, chunk_no * chunksize + len(chunk), lencorpus)) gammat = self.do_estep(chunk, other) if self.optimize_alpha: self.update_alpha(gammat, rho) dirty = True del chunk # perform an M step. determine when based on update_every, don't do this after every chunk if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: if self.dispatcher: # distributed mode: wait for all workers to finish logger.info("reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other) del other # free up some mem if self.dispatcher: logger.info('initializing workers') self.dispatcher.reset(self.state) else: other = LdaState(self.eta, self.state.sstats.shape) dirty = False #endfor single corpus iteration if reallen != lencorpus: raise RuntimeError("input corpus size changed during training (don't use generators as input)") if dirty: # finish any remaining updates if self.dispatcher: # distributed mode: wait for all workers to finish logger.info("reached the end of input; now waiting for all remaining jobs to finish") other = self.dispatcher.getstate() self.do_mstep(rho(), other) del other dirty = False
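# Hedged sketch of the rho() step-size schedule used by update() above: the weight given to
# each new update decays as pow(offset + num_updates / chunksize, -decay). The parameter
# values are illustrative; only the formula mirrors the code.
offset, decay, chunksize = 1.0, 0.5, 2000

for num_updates in (0, 2000, 10000, 100000):
    rho = pow(offset + num_updates / chunksize, -decay)
    print("after %6i documents: rho = %.4f" % (num_updates, rho))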
def add_documents(self, corpus, chunksize=None, decay=None): """ Update singular value decomposition to take into account a new corpus of documents. Training proceeds in chunks of `chunksize` documents at a time. The size of `chunksize` is a tradeoff between increased speed (bigger `chunksize`) vs. lower memory footprint (smaller `chunksize`). If the distributed mode is on, each chunk is sent to a different worker/computer. Setting `decay` < 1.0 causes re-orientation towards new data trends in the input document stream, by giving less emphasis to old observations. This allows LSA to gradually "forget" old observations (documents) and give more preference to new ones. """ logger.info("updating model with new documents") # get computation parameters; if not specified, use the ones from constructor if chunksize is None: chunksize = self.chunksize if decay is None: decay = self.decay if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo update = Projection(self.num_terms, self.num_topics, None) update.u, update.s = stochastic_svd(corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) else: # the one-pass algo doc_no = 0 if self.dispatcher: logger.info('initializing %s workers' % self.numworkers) self.dispatcher.reset() for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info("preparing a new chunk of documents") nnz = sum(len(doc) for doc in chunk) # construct the job as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense matrix! logger.debug("converting corpus to csc format") job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz) del chunk doc_no += job.shape[1] if self.dispatcher: # distributed version: add this job to the job queue, so workers can work on it logger.debug("creating job #%i" % chunk_no) self.dispatcher.putjob(job) # put job into queue; this will eventually block, because the queue has a small finite size del job logger.info("dispatched documents up to #%s" % doc_no) else: # serial version, there is only one "worker" (myself) => process the job directly update = Projection(self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, power_iters=self.power_iters) del job self.projection.merge(update, decay=decay) del update logger.info("processed documents up to #%s" % doc_no) self.print_topics(5) # wait for all workers to finish (distributed version only) if self.dispatcher: logger.info("reached the end of input; now waiting for all remaining jobs to finish") self.projection = self.dispatcher.getstate() # logger.info("top topics after adding %i documents" % doc_no) # self.print_debug(10) else: assert not self.dispatcher, "must be in serial mode to receive jobs" assert self.onepass, "distributed two-pass algo not supported yet" update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
def train(self, instances, total_feats=None, feat_count=0, chunksize=100): """ Update the model's neural weights from a sequence of instances. Each instance must be a list of unicode strings or ints (indices). """ if FAST_VERSION < 0: import warnings warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`") logger.info("training model with %i workers on %i vocabulary and %i embedding size" ", and 'negative sampling'=%s" % (self.workers, len(self.vocab), self.layer1_size, self.negative)) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] feat_count = [feat_count] total_feats = total_feats or int(sum(v.count for v in itervalues(self.vocab))) jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of instances from the jobs queue.""" ''' multiple working space ''' work = zeros(self.layer1_size, dtype=REAL) # each thread must have its own work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * feat_count[0] / total_feats)) # how many words did we train on? out-of-vocabulary (unknown) features do not count job_words = sum(train_instance(self, instance, alpha, work) for instance in job) with lock: feat_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.info("PROGRESS: at %.2f%% features, alpha %.05f, %.0f features/s" % (100.0 * feat_count[0] / total_feats, alpha, feat_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() def prepare_instances(): for instance in instances: sampled = [self.vocab[feat] for feat in instance if feat in self.vocab] yield sampled # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue for job_no, job in enumerate(utils.grouper(prepare_instances(), chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i features took %.1fs, %.0f features/s" % (feat_count[0], elapsed, feat_count[0] / elapsed if elapsed else 0.0)) return feat_count[0]
def update(self, corpus): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable (repeatable stream of documents), The E-step is distributed into the several processes. This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0>. """ # rho is the "speed" of updating, decelerating over time rho = lambda: pow(1.0 + self.num_updates / self.chunksize, -self.decay) try: lencorpus = len(corpus) except: logger.warning("input corpus stream has no len(); counting documents") lencorpus = sum(1 for _ in corpus) if lencorpus == 0: logger.warning("LdaMulticore.update() called with an empty corpus") return self.state.numdocs += lencorpus if not self.batch: updatetype = "online" updateafter = self.chunksize * self.workers else: updatetype = "batch" updateafter = lencorpus evalafter = min(lencorpus, (self.eval_every * updateafter or 0)) updates_per_pass = max(1, lencorpus / updateafter) logger.info("running %s LDA training, %s topics, %i passes over the" " supplied corpus of %i documents, updating every %i documents," " evaluating every ~%i documents, iterating %ix with a convergence threshold of %f", updatetype, self.num_topics, self.passes, lencorpus, updateafter, evalafter, self.iterations, self.gamma_threshold) if updates_per_pass * self.passes < 10: logger.warning("too few updates, training might not converge; consider " "increasing the number of passes or iterations to improve accuracy") def worker_e_step(input_queue, result_queue): """ Perform E-step for each (chunk_no, chunk, model) 3-tuple from the input queue, placing the resulting state into the result queue. """ logger.debug("worker process entering E-step loop") while True: logger.debug("getting a new job") chunk_no, chunk, worker_lda = input_queue.get() logger.debug("processing chunk #%i of %i documents", chunk_no, len(chunk)) worker_lda.state.reset() worker_lda.do_estep(chunk) # TODO: auto-tune alpha? del chunk logger.debug("processed chunk, queuing the result") result_queue.put(worker_lda.state) del worker_lda # free up some memory logger.debug("result put") job_queue = Queue(maxsize=2 * self.workers) result_queue = Queue() logger.info("training LDA model using %i processes", self.workers) pool = Pool(self.workers, worker_e_step, (job_queue, result_queue,)) for pass_ in xrange(self.passes): queue_size, reallen = [0], 0 other = LdaState(self.eta, self.state.sstats.shape) def process_result_queue(force=False): """ Clear the result queue, merging all intermediate results, and update the LDA model if necessary. 
""" merged_new = False while not result_queue.empty(): other.merge(result_queue.get()) queue_size[0] -= 1 merged_new = True if (force and merged_new and queue_size[0] == 0) or (not self.batch and (other.numdocs >= updateafter)): self.do_mstep(rho(), other) other.reset() if self.eval_every is not None and ((force and queue_size[0] == 0) or (self.eval_every != 0 and (self.num_updates / updateafter) % self.eval_every == 0)): self.log_perplexity(chunk, total_docs=lencorpus) chunk_stream = utils.grouper(corpus, self.chunksize, as_numpy=True) for chunk_no, chunk in enumerate(chunk_stream): reallen += len(chunk) # keep track of how many documents we've processed so far # put the chunk into the workers' input job queue chunk_put = False while not chunk_put: try: job_queue.put((chunk_no, chunk, self), block=False, timeout=0.1) chunk_put = True queue_size[0] += 1 logger.info('PROGRESS: pass %i, dispatched chunk #%i = ' 'documents up to #%i/%i, outstanding queue size %i', pass_, chunk_no, chunk_no * self.chunksize + len(chunk), lencorpus, queue_size[0]) except Full: # in case the input job queue is full, keep clearing the # result queue, to make sure we don't deadlock process_result_queue() process_result_queue() #endfor single corpus pass # wait for all outstanding jobs to finish while queue_size[0] > 0: process_result_queue(force=True) if reallen != lencorpus: raise RuntimeError("input corpus size changed during training (don't use generators as input)") #endfor entire update pool.close()
parser.add_argument('-next', '--next', type=str, default='next_bookcorpus.shlf')
parser.add_argument('-conj', '--conj', type=str, default='conj_bookcorpus.shlf')
# parser.add_argument('-log_file', '--log_file', type=str, default='')
args = parser.parse_args()

sentences = MySentences(args.corpus)
log_file = pjoin('process_' + get_time_str() + '.log')
logging.basicConfig(filename=log_file, level=logging.DEBUG)

chunksize = 100
groups = enumerate(utils.grouper(sentences, chunksize))
n_sentence = 0
order_ = []
next_ = []
conj_ = []
while True:
    try:
        sentence_no, items = next(groups)
        o, n, c = make_all_tasks(items)
        order_ += o
        next_ += n
        conj_ += c
        logging.info("%s \t %d %d %d %d", get_time_str(), sentence_no * chunksize,
                     len(order_), len(next_), len(conj_))
    except StopIteration:
        # assumed continuation: stop once the sentence stream is exhausted
        break
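# Hedged sketch of a streaming sentence iterator of the kind the script above feeds into
# utils.grouper: it re-reads a text file lazily on every pass, so the corpus never has to fit
# in memory. This is a generic illustration, not the MySentences class used above, and the
# file path in the usage note is hypothetical.
class StreamingSentences:
    def __init__(self, path):
        self.path = path

    def __iter__(self):
        with open(self.path, encoding="utf8") as fin:
            for line in fin:
                yield line.split()            # one whitespace-tokenized sentence per line

# usage (assuming a tokenized text file exists at this hypothetical path):
# for chunk in utils.grouper(StreamingSentences("corpus.txt"), 100):
#     ...  # process 100 sentences at a time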
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, power_iters=0, dtype=numpy.float64, eps=1e-6): """ Run truncated Singular Value Decomposition (SVD) on a sparse input. Return (U, S): the left singular vectors and the singular values of the input data stream `corpus` [4]_. The corpus may be larger than RAM (iterator of vectors). This may return less than the requested number of top `rank` factors, in case the input itself is of lower rank. The `extra_dims` (oversampling) and especially `power_iters` (power iterations) parameters affect accuracy of the decomposition. This algorithm uses `2+power_iters` passes over the input data. In case you can only afford a single pass, set `onepass=True` in :class:`LsiModel` and avoid using this function directly. The decomposition algorithm is based on **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.** .. [4] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole corpus fits into core memory and a different (more efficient) code path is chosen. """ rank = int(rank) if extra_dims is None: samples = max(10, 2 * rank) # use more samples than requested factors, to improve accuracy else: samples = rank + int(extra_dims) logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters)) num_terms = int(num_terms) # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O) # build Y in blocks of `chunksize` documents (much faster than going one-by-one # and more memory friendly than processing all documents at once) y = numpy.zeros(dtype=dtype, shape=(num_terms, samples)) logger.info("1st phase: constructing %s action matrix" % str(y.shape)) if scipy.sparse.issparse(corpus): m, n = corpus.shape assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % (m, num_terms) o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(y.dtype) # draw a random gaussian matrix sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, corpus.data, o.ravel(), y.ravel()) # y = corpus * o del o # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype! # so check for equal dtype explicitly, to avoid the extra memory footprint if possible if y.dtype != dtype: y = y.astype(dtype) logger.info("orthonormalizing %s action matrix" % str(y.shape)) y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range logger.debug("running %i power iterations" % power_iters) for power_iter in xrange(power_iters): q = corpus.T * q q = [corpus * q] q, _ = matutils.qr_destroy(q) # orthonormalize the range after each power iteration step else: num_docs = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize)) # construct the chunk as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense (num_terms x chunksize) matrix! 
s = sum(len(doc) for doc in chunk) chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC m, n = chunk.shape assert m == num_terms assert n <= chunksize # the very last chunk of A is allowed to be smaller in size num_docs += n logger.debug("multiplying chunk * gauss") o = numpy.random.normal(0.0, 1.0, (n, samples)).astype(dtype) # draw a random gaussian matrix sparsetools.csc_matvecs(m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o chunk.data, o.ravel(), y.ravel()) del chunk, o y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range for power_iter in xrange(power_iters): logger.info("running power iteration #%i" % (power_iter + 1)) yold = q.copy() q[:] = 0.0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs)) chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC tmp = chunk.T * yold tmp = chunk * tmp del chunk q += tmp del yold q = [q] q, _ = matutils.qr_destroy(q) # orthonormalize the range qt = q[:, :samples].T.copy() del q if scipy.sparse.issparse(corpus): b = qt * corpus logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape)) u, s, vt = scipy.linalg.svd(b, full_matrices=False) del b, vt else: # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A # again, construct X incrementally, in chunks of `chunksize` documents from the streaming # input corpus A, to avoid using O(number of documents) memory x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64) logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape)) for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs)) chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype) b = qt * chunk # dense * sparse matrix multiply del chunk x += numpy.dot(b, b.T) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :( del b # now we're ready to compute decomposition of the small matrix X logger.info("running dense decomposition on %s covariance matrix" % str(x.shape)) u, s, vt = scipy.linalg.svd(x) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :) s = numpy.sqrt(s) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus q = qt.T.copy() del qt logger.info("computing the final decomposition") keep = clip_spectrum(s**2, rank, discard=eps) u = u[:, :keep].copy() s = s[:keep] u = numpy.dot(q, u) return u.astype(dtype), s.astype(dtype)
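# Hedged sketch of the spectrum-clipping step referenced above (clip_spectrum): keep only as
# many factors as are needed to retain all but `discard` of the spectrum's energy, and never
# more than `rank`. This is a simplified illustration, not gensim's exact implementation.
import numpy as np

def toy_clip_spectrum(s_squared, rank, discard=1e-6):
    energy = np.cumsum(s_squared) / np.sum(s_squared)
    keep = int(np.searchsorted(energy, 1.0 - discard) + 1)   # factors covering 1 - discard of the energy
    return min(rank, keep)

s = np.array([10.0, 5.0, 1.0, 1e-4, 1e-7])
print(toy_clip_spectrum(s ** 2, rank=5))   # 3: the tiny trailing singular values are discarded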
def add_documents(self, corpus, chunksize=None, decay=None): """ Update singular value decomposition to take into account a new corpus of documents. Training proceeds in chunks of `chunksize` documents at a time. The size of `chunksize` is a tradeoff between increased speed (bigger `chunksize`) vs. lower memory footprint (smaller `chunksize`). If the distributed mode is on, each chunk is sent to a different worker/computer. Setting `decay` < 1.0 causes re-orientation towards new data trends in the input document stream, by giving less emphasis to old observations. This allows LSA to gradually "forget" old observations (documents) and give more preference to new ones. """ logger.info("updating model with new documents") # get computation parameters; if not specified, use the ones from constructor if chunksize is None: chunksize = self.chunksize if decay is None: decay = self.decay if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo update = Projection(self.num_terms, self.num_topics, None) update.u, update.s = stochastic_svd( corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) else: # the one-pass algo doc_no = 0 for chunk_no, chunk in enumerate( utils.grouper(corpus, chunksize)): logger.info("preparing a new chunk of documents") nnz = sum(len(doc) for doc in chunk) # construct the job as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense matrix! logger.debug("converting corpus to csc format") job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz) del chunk doc_no += job.shape[1] if self.dispatcher: # distributed version: add this job to the job queue, so workers can work on it logger.debug("creating job #%i" % chunk_no) self.dispatcher.putjob( job ) # put job into queue; this will eventually block, because the queue has a small finite size del job logger.info("dispatched documents up to #%s" % doc_no) else: # serial version, there is only one "worker" (myself) => process the job directly update = Projection(self.num_terms, self.num_topics, job, extra_dims=self.extra_samples, power_iters=self.power_iters) del job self.projection.merge(update, decay=decay) del update logger.info("processed documents up to #%s" % doc_no) self.print_topics(5) # wait for all workers to finish (distributed version only) if self.dispatcher: logger.info( "reached the end of input; now waiting for all remaining jobs to finish" ) self.projection = self.dispatcher.getstate() # logger.info("top topics after adding %i documents" % doc_no) # self.print_debug(10) else: assert not self.dispatcher, "must be in serial mode to receive jobs" assert self.onepass, "distributed two-pass algo not supported yet" update = Projection(self.num_terms, self.num_topics, corpus.tocsc(), extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
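For context, a hedged example of how `add_documents` is typically driven from user code; the corpus construction below uses the public gensim API and is not part of this file:

    from gensim import corpora, models

    dictionary = corpora.Dictionary(texts)               # `texts`: iterable of tokenized documents
    bow = [dictionary.doc2bow(text) for text in texts]

    lsi = models.LsiModel(num_topics=200, id2word=dictionary, onepass=True)
    lsi.add_documents(bow, chunksize=10000)                    # first batch
    lsi.add_documents(more_bow, chunksize=10000, decay=0.9)    # later batch, discounting old observations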
def train(self, sentences, total_words=None, word_count=0, paragraphs_only = False, vocab = None, paragraphs = None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ if paragraphs is None: paragraphs = self.synparagraph if vocab is None: vocab = self.paragraph_vocab if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab)) jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) total_error = [0.0] def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" paragraph_work = zeros(self.paragraph_size, dtype=REAL) # each thread must have its own work memory error = zeros(1, dtype = REAL) if self.concatenate: # word work here is for each individual word, so it has length logistic regression - para size word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype = REAL) neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL) else: # here word work is aggregated: word_work = zeros(self.layer1_size, dtype = REAL) neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) zeros(self.logistic_regression_size, dtype = REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha # how many words did we train on? out-of-vocabulary (unknown) words do not count job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job)) with lock: # here we can store the scores for later plotting and viewing... word_count[0] += job_words elapsed = time.time() - start total_error[0] += error[0] if elapsed >= next_report[0]: logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() # convert input strings to Vocab objects, and paragraph to paragraph (Vocab) object: no_oov = (self.create_job(sentence,vocab) for sentence in sentences) for job_no, job in enumerate(utils.grouper(no_oov, self.batchsize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i sentences took %.1fs, %.0f sentences/s, %.6f" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0, total_error[0])) return (word_count[0], total_error[0])
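The threading pattern above -- a bounded job queue fed by `utils.grouper`, drained by daemon worker threads, terminated with `None` poison pills -- is generic. A stripped-down sketch of just that skeleton (the per-job "work" here is a trivial counter, purely illustrative):

    import threading
    from itertools import islice
    from Queue import Queue   # Python 2, as in the surrounding code

    def grouper(iterable, chunksize):
        it = iter(iterable)
        while True:
            chunk = list(islice(it, chunksize))
            if not chunk:
                break
            yield chunk

    def run_jobs(stream, n_workers=4, batch=100):
        jobs = Queue(maxsize=2 * n_workers)   # bound the queue so the producer cannot run far ahead
        lock = threading.Lock()
        done = [0]                            # shared counter, guarded by `lock`

        def worker():
            while True:
                job = jobs.get()
                if job is None:               # poison pill: no more work
                    break
                with lock:
                    done[0] += len(job)

        workers = [threading.Thread(target=worker) for _ in range(n_workers)]
        for t in workers:
            t.daemon = True
            t.start()
        for job in grouper(stream, batch):    # producer: fill the queue in chunks
            jobs.put(job)
        for _ in range(n_workers):
            jobs.put(None)
        for t in workers:
            t.join()
        return done[0]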
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None, power_iters=0, dtype=numpy.float64, eps=1e-6): """ Return (U, S): the left singular vectors and the singular values of the streamed input corpus `corpus` [3]_. This may actually return less than the requested number of top `rank` factors, in case the input is of lower rank. The `extra_dims` (oversampling) and especially `power_iters` (power iterations) parameters affect accuracy of the decomposition. This algorithm uses `2+power_iters` passes over the data. In case you can only afford a single pass over the input corpus, set `onepass=True` in :class:`LsiModel` and avoid using this algorithm directly. The decomposition algorithm is based on **Halko, Martinsson, Tropp. Finding structure with randomness, 2009.** .. [3] If `corpus` is a scipy.sparse matrix instead, it is assumed the whole corpus fits into core memory and a different (more efficient) code path is chosen. """ rank = int(rank) if extra_dims is None: samples = max( 10, 2 * rank ) # use more samples than requested factors, to improve accuracy else: samples = rank + int(extra_dims) logger.info("using %i extra samples and %i power iterations" % (samples - rank, power_iters)) num_terms = int(num_terms) # first phase: construct the orthonormal action matrix Q = orth(Y) = orth((A * A.T)^q * A * O) # build Y in blocks of `chunksize` documents (much faster than going one-by-one # and more memory friendly than processing all documents at once) y = numpy.zeros(dtype=dtype, shape=(num_terms, samples)) logger.info("1st phase: constructing %s action matrix" % str(y.shape)) if scipy.sparse.issparse(corpus): m, n = corpus.shape assert num_terms == m, "mismatch in number of features: %i in sparse matrix vs. %i parameter" % ( m, num_terms) o = numpy.random.normal(0.0, 1.0, (n, samples)).astype( y.dtype) # draw a random gaussian matrix sparsetools.csc_matvecs(m, n, samples, corpus.indptr, corpus.indices, corpus.data, o.ravel(), y.ravel()) # y = corpus * o del o # unlike numpy, scipy.sparse `astype()` copies everything, even if there is no change to dtype! # so check for equal dtype explicitly, to avoid the extra memory footprint if possible if y.dtype != dtype: y = y.astype(dtype) logger.info("orthonormalizing %s action matrix" % str(y.shape)) y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range logger.debug("running %i power iterations" % power_iters) for power_iter in xrange(power_iters): q = corpus.T * q q = [corpus * q] q, _ = matutils.qr_destroy( q) # orthonormalize the range after each power iteration step else: num_docs = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i' % (chunk_no * chunksize)) # construct the chunk as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense (num_terms x chunksize) matrix! 
s = sum(len(doc) for doc in chunk) chunk = matutils.corpus2csc( chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC m, n = chunk.shape assert m == num_terms assert n <= chunksize # the very last chunk of A is allowed to be smaller in size num_docs += n logger.debug("multiplying chunk * gauss") o = numpy.random.normal(0.0, 1.0, (n, samples)).astype( dtype) # draw a random gaussian matrix sparsetools.csc_matvecs( m, n, samples, chunk.indptr, chunk.indices, # y = y + chunk * o chunk.data, o.ravel(), y.ravel()) del chunk, o y = [y] q, _ = matutils.qr_destroy(y) # orthonormalize the range for power_iter in xrange(power_iters): logger.info("running power iteration #%i" % (power_iter + 1)) yold = q.copy() q[:] = 0.0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs)) chunk = matutils.corpus2csc( chunk, num_terms=num_terms, dtype=dtype) # documents = columns of sparse CSC tmp = chunk.T * yold tmp = chunk * tmp del chunk q += tmp del yold q = [q] q, _ = matutils.qr_destroy(q) # orthonormalize the range qt = q[:, :samples].T.copy() del q if scipy.sparse.issparse(corpus): b = qt * corpus logger.info("2nd phase: running dense svd on %s matrix" % str(b.shape)) u, s, vt = numpy.linalg.svd(b, full_matrices=False) del b, vt else: # second phase: construct the covariance matrix X = B * B.T, where B = Q.T * A # again, construct X incrementally, in chunks of `chunksize` documents from the streaming # input corpus A, to avoid using O(number of documents) memory x = numpy.zeros(shape=(qt.shape[0], qt.shape[0]), dtype=numpy.float64) logger.info("2nd phase: constructing %s covariance matrix" % str(x.shape)) for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info('PROGRESS: at document #%i/%i' % (chunk_no * chunksize, num_docs)) chunk = matutils.corpus2csc(chunk, num_terms=num_terms, dtype=qt.dtype) b = qt * chunk # dense * sparse matrix multiply del chunk x += numpy.dot( b, b.T ) # TODO should call the BLAS routine SYRK, but there is no SYRK wrapper in scipy :( del b # now we're ready to compute decomposition of the small matrix X logger.info("running dense decomposition on %s covariance matrix" % str(x.shape)) u, s, vt = numpy.linalg.svd( x ) # could use linalg.eigh, but who cares... and svd returns the factors already sorted :) s = numpy.sqrt( s ) # sqrt to go back from singular values of X to singular values of B = singular values of the corpus q = qt.T.copy() del qt logger.info("computing the final decomposition") keep = clip_spectrum(s**2, rank, discard=eps) u = u[:, :keep].copy() s = s[:keep] u = numpy.dot(q, u) return u.astype(dtype), s.astype(dtype)
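The power-iteration loop above implements the standard randomized range finder from Halko et al.: Q = orth((A * A.T)^q * A * Omega), re-orthonormalized after every pass so the sample block stays well conditioned. A dense, non-streaming toy version of the same idea in plain numpy:

    import numpy as np

    def randomized_range(A, samples, power_iters=2):
        # A: (m x n) dense array; samples = rank + oversampling
        omega = np.random.normal(0.0, 1.0, (A.shape[1], samples))
        q, _ = np.linalg.qr(A.dot(omega))            # Q = orth(A * Omega)
        for _ in range(power_iters):
            q, _ = np.linalg.qr(A.dot(A.T.dot(q)))   # Q = orth(A * A.T * Q)
        return q

    # second phase: B = Q.T * A is small, so its dense SVD is cheap;
    # lifting back with Q gives the approximate factors of A
    # u_small, s, vt = np.linalg.svd(q.T.dot(A), full_matrices=False); u = q.dot(u_small)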
def add_documents(self, corpus, chunksize=None, decay=None): """ Update singular value decomposition to take into account a new corpus of documents. Training proceeds in chunks of `chunksize` documents at a time. The size of `chunksize` is a tradeoff between increased speed (bigger `chunksize`) vs. lower memory footprint (smaller `chunksize`). If the distributed mode is on, each chunk is sent to a different worker/computer. Setting `decay` < 1.0 causes re-orientation towards new data trends in the input document stream, by giving less emphasis to old observations. This allows LSA to gradually "forget" old observations (documents) and give more preference to new ones. """ logger.info("updating model with new documents") # get computation parameters; if not specified, use the ones from constructor if chunksize is None: chunksize = self.chunksize if decay is None: decay = self.decay if not scipy.sparse.issparse(corpus): if not self.onepass: # we are allowed multiple passes over the input => use a faster, randomized two-pass algo update = Projection(self.num_terms, self.num_topics, None) update.u, update.s = stochastic_svd(corpus, self.num_topics, num_terms=self.num_terms, chunksize=chunksize, extra_dims=self.extra_samples, power_iters=self.power_iters) self.projection.merge(update, decay=decay) else: # the one-pass algo doc_no = 0 ##### counters for jobs count_sent = 0 count_recv = 0 for chunk_no, chunk in enumerate(utils.grouper(corpus, chunksize)): logger.info("preparing a new chunk of documents") nnz = sum(len(doc) for doc in chunk) # construct the job as a sparse matrix, to minimize memory overhead # definitely avoid materializing it as a dense matrix! logger.debug("converting corpus to csc format") job = matutils.corpus2csc(chunk, num_docs=len(chunk), num_terms=self.num_terms, num_nnz=nnz) del chunk doc_no += job.shape[1] ##### distributed version if self.dispatcher: ##### store the comm size and prepare status num_workers = self.comm.Get_size() - 1 status = MPI.Status() ##### time to send some jobs logger.debug("creating job #%i" % chunk_no) count_sent += 1 ##### send the initial batch if (chunk_no < num_workers): self.comm.send(job, dest=chunk_no+1) ##### wait around for ready workers else: self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status) source = status.Get_source() count_recv += 1 self.comm.send(job, dest=source) del job logger.info("dispatched documents up to #%s" % doc_no) else: # serial version, there is only one "worker" (myself) => process the job directly update = Projection(self.num_terms, self.num_topics, job) del job self.projection.merge(update, decay=decay) del update logger.info("processed documents up to #%s" % doc_no) self.print_topics(5) ##### wait for all workers to finish (distributed version only) if self.dispatcher: logger.info("reached the end of input; now waiting for all remaining jobs to finish") ##### workers are finishing up while (count_recv < count_sent): self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status) count_recv += 1 ##### placeholder for the result result = None result_recv = 0 ##### send the kill messages for i in xrange(num_workers): self.comm.send(None, dest=i+1) ##### wait for all results while (result_recv < num_workers): r = self.comm.recv(source=MPI.ANY_SOURCE, tag=MPI.ANY_TAG, status=status) result_recv += 1 if result_recv == 1: result = r else: result.merge(r) logger.info("finished merging projections") self.projection = result # logger.info("top topics after adding %i documents" % doc_no) # 
# self.print_debug(10) else: assert not self.dispatcher, "must be in serial mode to receive jobs" assert self.onepass, "distributed two-pass algo not supported yet" update = Projection(self.num_terms, self.num_topics, corpus.tocsc()) self.projection.merge(update, decay=decay) logger.info("processed sparse job of %i documents" % (corpus.shape[1]))
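The `#####` branch above replaces the Pyro dispatcher with raw MPI point-to-point messaging. A hedged sketch of the matching worker-side loop with mpi4py (the `process` function and the mergeable result object are assumptions inferred from the master loop, not code from this file):

    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    if comm.Get_rank() != 0:              # every rank except the master
        result = None
        while True:
            job = comm.recv(source=0)     # blocking receive of the next chunk
            if job is None:               # kill message: master has no more chunks
                break
            update = process(job)         # hypothetical per-chunk projection
            result = update if result is None else result.merge(update)
            comm.send("ready", dest=0)    # tell the master this worker can take another job
        comm.send(result, dest=0)         # ship the accumulated partial result back for merging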
def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, decay=None, offset=None, passes=None, update_every=None, eval_every=None, iterations=None, gamma_threshold=None, chunks_as_numpy=False): """ Train the model with new documents, by EM-iterating over `corpus` until the topics converge (or until the maximum number of allowed iterations is reached). `corpus` must be an iterable (repeatable stream of documents). This update also supports updating an already trained model (`self`) with new documents from `corpus`; the two models are then merged in proportion to the number of old vs. new documents. This feature is still experimental for non-stationary input streams. For stationary input (no topic drift in new documents), on the other hand, this equals the online update of Hoffman et al. and is guaranteed to converge for any `decay` in (0.5, 1.0]. Additionally, for smaller `corpus` sizes, an increasing `offset` may be beneficial (see Table 1 in Hoffman et al.) If update is called with authors that already exist in the model, it will resume training on not only new documents for that author, but also the previously seen documents. This is necessary for those authors' topic distributions to converge. Every time `update(corpus, author2doc)` is called, the new documents are appended to all the previously seen documents, and author2doc is combined with the previously seen authors. To resume training on all the data seen by the model, simply call `update()`. It is not possible to add new authors to existing documents, as all documents in `corpus` are assumed to be new documents. Args: corpus (gensim corpus): The corpus with which the author-topic model should be updated. author2doc (dictionary): author to document mapping corresponding to indexes in input corpus. doc2author (dictionary): document to author mapping corresponding to indexes in input corpus. chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np array or not. np can in some settings turn the term IDs into floats; these will be converted back into integers in inference, which incurs a performance hit. For distributed computing it may be desirable to keep the chunks as np arrays. For other parameter settings, see :class:`AuthorTopicModel` constructor. """ # use parameters given in constructor, unless user explicitly overrode them if decay is None: decay = self.decay if offset is None: offset = self.offset if passes is None: passes = self.passes if update_every is None: update_every = self.update_every if eval_every is None: eval_every = self.eval_every if iterations is None: iterations = self.iterations if gamma_threshold is None: gamma_threshold = self.gamma_threshold # TODO: if deepcopy is not used here, something goes wrong. When unit tests are run (specifically "testPasses"), # the process simply gets killed. author2doc = deepcopy(author2doc) doc2author = deepcopy(doc2author) # TODO: it is not possible to add new authors to an existing document (all input documents are treated # as completely new documents). Perhaps this functionality could be implemented. # If it's absolutely necessary, the user can delete the documents that have new authors, and call update # on them with the new and old authors. if corpus is None: # Just keep training on the already available data. # Assumes self.update() has been called before with input documents and corresponding authors. assert self.total_docs > 0, 'update() was called with no documents to train on.'
train_corpus_idx = [d for d in xrange(self.total_docs)] num_input_authors = len(self.author2doc) else: if doc2author is None and author2doc is None: raise ValueError( 'at least one of author2doc/doc2author must be specified, to establish input space dimensionality' ) # If either doc2author or author2doc is missing, construct them from the other. if doc2author is None: doc2author = construct_doc2author(corpus, author2doc) elif author2doc is None: author2doc = construct_author2doc(doc2author) # Number of authors that need to be updated. num_input_authors = len(author2doc) try: len_input_corpus = len(corpus) except TypeError: logger.warning( "input corpus stream has no len(); counting documents") len_input_corpus = sum(1 for _ in corpus) if len_input_corpus == 0: logger.warning( "AuthorTopicModel.update() called with an empty corpus") return self.total_docs += len_input_corpus # Add new documents in corpus to self.corpus. self.extend_corpus(corpus) # Obtain a list of new authors. new_authors = [] # Sorting the author names makes the model more reproducible. for a in sorted(author2doc.keys()): if not self.author2doc.get(a): new_authors.append(a) num_new_authors = len(new_authors) # Add new authors to author2id/id2author dictionaries. for a_id, a_name in enumerate(new_authors): self.author2id[a_name] = a_id + self.num_authors self.id2author[a_id + self.num_authors] = a_name # Increment the number of total authors seen. self.num_authors += num_new_authors # Initialize the variational distributions q(theta|gamma) gamma_new = self.random_state.gamma( 100., 1. / 100., (num_new_authors, self.num_topics)) self.state.gamma = np.vstack([self.state.gamma, gamma_new]) # Combine author2doc with self.author2doc. # First, increment the document IDs by the number of previously seen documents. for a, doc_ids in author2doc.items(): doc_ids = [ d + self.total_docs - len_input_corpus for d in doc_ids ] # For all authors in the input corpus, add the new documents. for a, doc_ids in author2doc.items(): if self.author2doc.get(a): # This is not a new author, append new documents. self.author2doc[a].extend(doc_ids) else: # This is a new author, create index. self.author2doc[a] = doc_ids # Add all new documents to self.doc2author. for d, a_list in doc2author.items(): self.doc2author[d] = a_list # Train on all documents of authors in input_corpus. train_corpus_idx = [] for _ in author2doc.keys(): # For all authors in input corpus. for doc_ids in self.author2doc.values(): # For all documents in total corpus. train_corpus_idx.extend(doc_ids) # Make the list of training documents unique. train_corpus_idx = list(set(train_corpus_idx)) # train_corpus_idx is only a list of indexes, so "len" is valid.
lencorpus = len(train_corpus_idx) if chunksize is None: chunksize = min(lencorpus, self.chunksize) self.state.numdocs += lencorpus if update_every: updatetype = "online" updateafter = min(lencorpus, update_every * self.numworkers * chunksize) else: updatetype = "batch" updateafter = lencorpus evalafter = min(lencorpus, (eval_every or 0) * self.numworkers * chunksize) updates_per_pass = max(1, lencorpus / updateafter) logger.info( "running %s author-topic training, %s topics, %s authors, %i passes over the supplied corpus of %i documents, updating model once " "every %i documents, evaluating perplexity every %i documents, iterating %ix with a convergence threshold of %f", updatetype, self.num_topics, num_input_authors, passes, lencorpus, updateafter, evalafter, iterations, gamma_threshold) if updates_per_pass * passes < 10: logger.warning( "too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy" ) # rho is the "speed" of updating; TODO try other fncs # pass_ + num_updates handles increasing the starting t for each pass, # while allowing it to "reset" on the first pass of each update def rho(): return pow(offset + pass_ + (self.num_updates / chunksize), -decay) for pass_ in xrange(passes): if self.dispatcher: logger.info('initializing %s workers', self.numworkers) self.dispatcher.reset(self.state) else: # gamma is not needed in "other", thus its shape is (0, 0). other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0)) dirty = False reallen = 0 for chunk_no, chunk_doc_idx in enumerate( utils.grouper(train_corpus_idx, chunksize, as_numpy=chunks_as_numpy)): chunk = [self.corpus[d] for d in chunk_doc_idx] reallen += len( chunk ) # keep track of how many documents we've processed so far if eval_every and ((reallen == lencorpus) or ((chunk_no + 1) % (eval_every * self.numworkers) == 0)): # log_perplexity requires the indexes of the documents being evaluated, to know what authors # correspond to the documents. self.log_perplexity(chunk, chunk_doc_idx, total_docs=lencorpus) if self.dispatcher: # add the chunk to dispatcher's job queue, so workers can munch on it logger.info( "PROGRESS: pass %i, dispatching documents up to #%i/%i", pass_, chunk_no * chunksize + len(chunk), lencorpus) # this will eventually block until some jobs finish, because the queue has a small finite length self.dispatcher.putjob(chunk) else: logger.info("PROGRESS: pass %i, at document #%i/%i", pass_, chunk_no * chunksize + len(chunk), lencorpus) # do_estep requires the indexes of the documents being trained on, to know what authors # correspond to the documents. gammat = self.do_estep(chunk, self.author2doc, self.doc2author, rho(), other, chunk_doc_idx) if self.optimize_alpha: self.update_alpha(gammat, rho()) dirty = True del chunk # perform an M step. 
# determine when, based on update_every; don't do this after every chunk if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0: if self.dispatcher: # distributed mode: wait for all workers to finish logger.info( "reached the end of input; now waiting for all remaining jobs to finish" ) other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other # frees up memory if self.dispatcher: logger.info('initializing workers') self.dispatcher.reset(self.state) else: other = AuthorTopicState(self.eta, self.state.sstats.shape, (0, 0)) dirty = False # endfor single corpus iteration if reallen != lencorpus: raise RuntimeError( "input corpus size changed during training (don't use generators as input)" ) if dirty: # finish any remaining updates if self.dispatcher: # distributed mode: wait for all workers to finish logger.info( "reached the end of input; now waiting for all remaining jobs to finish" ) other = self.dispatcher.getstate() self.do_mstep(rho(), other, pass_ > 0) del other
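A hedged end-to-end example of driving this `update` through the public constructor; the corpus, dictionary and author names below are invented for illustration:

    from gensim.models import AuthorTopicModel

    # author2doc maps an author name to indexes into `corpus`
    author2doc = {"alice": [0, 1, 2], "bob": [2, 3]}

    model = AuthorTopicModel(corpus=corpus, author2doc=author2doc,
                             id2word=dictionary, num_topics=10, passes=5)

    # later: fold in a fresh batch; indexes in new_author2doc refer to
    # positions inside `new_corpus`, not the already-seen corpus
    model.update(new_corpus, new_author2doc)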
def train(self, sentences, total_words=None, word_count=0, chunksize=100): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ if FAST_VERSION < 0: import warnings warnings.warn( "Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`" ) logger.info( "training model with %i workers on %i vocabulary and %i features" % (self.workers, len(self.vocab), self.layer1_size) ) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") start, next_report = time.time(), [1.0] word_count, total_words = [word_count], total_words or sum(v.count for v in self.vocab.itervalues()) jobs = Queue( maxsize=2 * self.workers ) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # each thread must have its own work memory while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) # how many words did we train on? out-of-vocabulary (unknown) words do not count job_words = sum(train_sentence(self, sentence, alpha, work) for sentence in job) with lock: word_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.info( "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0) ) next_report[0] = ( elapsed + 1.0 ) # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier thread.start() # convert input strings to Vocab objects (or None for OOV words), and start filling the jobs queue no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) for job_no, job in enumerate(utils.grouper(no_oov, chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info( "training on %i words took %.1fs, %.0f words/s" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0) ) return word_count[0]
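The per-job learning rate above decays linearly with the fraction of words already trained, floored at `min_alpha`. Spelled out on its own (the numbers in the comment are arbitrary):

    def current_alpha(alpha, min_alpha, words_done, total_words):
        # linear decay from `alpha` towards zero over the corpus, clipped at `min_alpha`
        return max(min_alpha, alpha * (1.0 - float(words_done) / total_words))

    # e.g. starting at 0.025 with a floor of 0.0001, halfway through training:
    # current_alpha(0.025, 0.0001, 500000, 1000000) == 0.0125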
def train(self, input_file=None, total_words=None, word_count=0, chunksize=100, alpha=0.025, alpha_doc=0.025, sentences_length=None): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of utf8 strings. """ if FAST_VERSION < 0: import warnings warnings.warn("Cython compilation failed, training will be slow. Do you have Cython installed? `pip install cython`") logger.info("training model with %i workers on %i vocabulary and %i features, " "using 'skipgram'=%s 'hierarchical softmax'=%s 'subsample'=%s and 'negative sampling'=%s" % (self.workers, len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative)) if not self.vocab: raise RuntimeError("you must first build vocabulary before training the model") # set the learning rates self.alpha_doc = float(alpha_doc) self.alpha = float(alpha) start, next_report = time.time(), [1.0] word_count = [word_count] total_words = total_words or int(sum(v.count * v.sample_probability for v in itervalues(self.vocab))) jobs = Queue(maxsize=2 * self.workers) # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( lock = threading.Lock() # for shared state (=number of words trained so far, log reports...) def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = zeros(self.syn1_size, dtype=REAL) # each thread must have its own work memory # neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) neu1 = zeros(self.syn1_size, dtype=REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.alpha_flag == 1: alpha = self.alpha # print "alpha", alpha # how many words did we train on?
# out-of-vocabulary (unknown) words do not count if self.sg: if self.skip_gram_type == 0: # job_id = 0 # sentence_id_,_ = job[job_id] # print "py sentence_id = ",sentence_id_ # bf = deepcopy(self.doc[sentence_id_][0]) # print "bf : ",bf job_words = sum(train_sentence_sg_simple(self, sentence_id,sentence, alpha, work,self.alpha_doc) for sentence_id,sentence in job) # print "af : ",self.doc[sentence_id_][0] # print "re : ", self.doc[sentence_id_][0] - bf elif self.skip_gram_type == 1: # ids_back = [sentence_id for sentence_id,_ in job] # bf_ = deepcopy(self.doc[ids_back]) job_words = sum(train_sentence_sg_average(self, sentence_id,sentence, alpha, work, neu1 ,self.alpha_doc) for sentence_id,sentence in job) # af_ = self.doc[ids_back] # print numpy.mean(af_ - bf_ ) elif self.skip_gram_type == 2: ids_back = [sentence_id for sentence_id,_ in job] bf_ = deepcopy(self.doc[ids_back]) job_words = sum(train_sentence_sg_concat(self, sentence_id,sentence, alpha, work,neu1,self.alpha_doc) for sentence_id,sentence in job) af_ = self.doc[ids_back] print numpy.mean(af_ - bf_ ) elif self.cbow_type == 4: ids_back = [sentence_id for sentence_id,_ in job] bf_ = deepcopy(self.doc[ids_back]) # job_words = sum(train_sentence_cbow_average_plus_doc_vec_extra_train(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) job_words = sum(train_sentence_cbow_average_plus_doc(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) af_ = self.doc[ids_back] print numpy.mean(af_ - bf_ ) # print "re : ", af_ - bf_ # elif self.cbow_type == 5: # job_words = sum(train_sentence_cbow_concatenate_v2(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) elif self.cbow_type == 3: job_id = 0 ids_back = [sentence_id for sentence_id,_ in job] bf_ = deepcopy(self.doc[ids_back]) sentence_id_,sentence_ = job[job_id] # # print "py sentence_id = ",sentence_id_ # bf = deepcopy(self.doc[sentence_id_]) # print "bf : ",bf # print "null_vec", self.null_vec job_words = sum(train_sentence_cbow_concatenate(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) # af_ = self.doc[ids_back] # print numpy.mean(af_ - bf_ ) # print "af : ",self.doc[sentence_id_] # print "re : ", self.doc[sentence_id_] - bf # print sum(self.doc[sentence_id_] - bf) elif self.cbow_type == 2: job_words = sum(train_sentence_cbow_concatenate_syn1_doc(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) elif self.cbow_type == 1: ids_back = [sentence_id for sentence_id,_ in job] bf_ = deepcopy(self.doc[ids_back]) job_words = sum(train_sentence_cbow_average_simple(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) af_ = self.doc[ids_back] print numpy.mean(af_ - bf_ ) # print af_ - bf_ elif self.cbow_type == 0: job_words = sum(train_sentence_cbow_syn1_doc(self, sentence_id,sentence, alpha, work, neu1,self.alpha_doc) for sentence_id,sentence in job) with lock: word_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.info("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)] for thread in workers: thread.daemon = True # make interrupting the process with ctrl+c easier
thread.start() def prepare_sentences(): ''' Shuffle the sentence indexes with random.shuffle here and train over them in random order. ''' if self.random_learn_flag: # shuffle the input data and train in random order indexes_sentence_ids = numpy.array(range(sentences_length)) random.shuffle(indexes_sentence_ids, lambda: random_seed) sentences = [(indexes_sentence_ids[index], sentence) for index, sentence in enumerate(open(input_file))] else: sentences = enumerate(open(input_file)) for sentence_id, sentence in sentences: sentence = sentence.split(u" ") # skip sentences that were already trained on (when resuming training from a partially trained model) if sentence_id < self.skip_id: print "skip! :" + str(sentence_id) + " " + str(self.skip_id) continue sampled = [self.vocab[word] for word in sentence if word in self.vocab and (self.vocab[word].sample_probability >= 1.0 or self.vocab[word].sample_probability >= numpy.random.random_sample())] yield (sentence_id, sampled) # no_oov = ([self.vocab.get(word, None) for word in sentence] for sentence in sentences) # convert input strings to Vocab objects (eliding OOV/downsampled words), and start filling the jobs queue for job_no, job in enumerate(utils.grouper(prepare_sentences(), chunksize)): logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize())) jobs.put(job) logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize()) for _ in xrange(self.workers): jobs.put(None) # give the workers heads up that they can finish -- no more work! for thread in workers: thread.join() elapsed = time.time() - start logger.info("training on %i words took %.1fs, %.0f words/s" % (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0)) return word_count[0]
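`prepare_sentences` above drops frequent words probabilistically using a precomputed per-word `sample_probability`. A self-contained sketch of that style of frequency subsampling; the formula is the common word2vec heuristic and may differ from the exact one this fork precomputes:

    import numpy

    def keep_probability(word_count, total_words, sample=1e-3):
        # frequent words are kept with probability < 1; rare words are effectively always kept
        freq = float(word_count) / total_words
        return min(1.0, (numpy.sqrt(freq / sample) + 1.0) * (sample / freq))

    def subsample(sentence, counts, total_words):
        return [w for w in sentence
                if keep_probability(counts[w], total_words) >= numpy.random.random_sample()]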