def __init__(self, sentences, epochs, total_examples, batch_size=10000, maxsize=5):
        self.sentences = sentences
        self.batch_size = batch_size
        self.index = 0
        self.current_epoch = 0
        self.total_examples = total_examples
        self.queue = Queue(maxsize=maxsize)
        self.closed_queue = False
        self.epochs = epochs

        if epochs > 1:      
            self.sentences = utils.RepeatCorpusNTimes(sentences, epochs)

        def _batchFiller():
            """Producer: slice `self.sentences` into batches and feed the queue."""
            counter = 0
            batch = list()
            for l in self.sentences:
                batch.append(l)
                counter += 1
                if len(batch) == batch_size:
                    self.queue.put(batch, block=True)
                    batch = list()
                if counter >= total_examples:
                    # one full pass over the corpus completed
                    self.current_epoch += 1
                    counter = 0
            if batch:  # flush the final, possibly smaller batch
                self.queue.put(batch, block=True)
            self.queue.put(None)  # sentinel: signal consumers that the stream is exhausted
            self.closed_queue = True
            print('Closed queue, epoch #: ' + str(self.current_epoch))

        self.filler_thread = threading.Thread(target=_batchFiller) 
        self.filler_thread.daemon = True
        self.filler_thread.start()
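
A hedged consumption sketch for the queue-backed batch iterator above. The class name `BatchedSentences` is an assumption (only its `__init__` is shown), so the consumer below reads the queue directly:

# Hypothetical consumer for the batch filler above; `BatchedSentences` is an assumed
# name for the class whose __init__ is shown, and only the queue contract is relied on.
batches = BatchedSentences(sentences, epochs=2, total_examples=100000, batch_size=10000)
while True:
    batch = batches.queue.get(block=True)
    if batch is None:          # sentinel put by _batchFiller after the final epoch
        break
    for sentence in batch:     # each batch is a list of up to batch_size sentences
        pass                   # hand each sentence to the training loop here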
Example #2
    def __init__(self,
                 sentences=None,
                 size=100,
                 alpha=0.025,
                 window=5,
                 min_count=5,
                 sample=0,
                 seed=1,
                 workers=1,
                 min_alpha=0.0001,
                 sg=1,
                 hs=1,
                 negative=0,
                 cbow_mean=0,
                 hashfxn=hash,
                 iter=1):
        """
        Initialize the model from an iterable of `sentences`. Each sentence is a
        list of words (unicode strings) that will be used for training.

        The `sentences` iterable can be simply a list, but for larger corpora,
        consider an iterable that streams the sentences directly from disk/network.
        See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in
        this module for such examples.

        If you don't supply `sentences`, the model is left uninitialized -- use if
        you plan to initialize it in some other way.

        `sg` defines the training algorithm. By default (`sg=1`), skip-gram is used; otherwise (`sg=0`), CBOW is employed.

        `size` is the dimensionality of the feature vectors.

        `window` is the maximum distance between the current and predicted word within a sentence.

        `alpha` is the initial learning rate (will linearly drop to zero as training progresses).

        `seed` = seed for the random number generator. Initial vectors for each
        word are seeded with a hash of the concatenation of word + str(seed).

        `min_count` = ignore all words with total frequency lower than this.

        `sample` = threshold for configuring which higher-frequency words are randomly downsampled;
            default is 0 (off), useful value is 1e-5.

        `workers` = use this many worker threads to train the model (=faster training with multicore machines).

        `hs` = if 1 (default), hierarchical softmax will be used for model training (set to 0 to disable).

        `negative` = if > 0, negative sampling will be used, the int for negative
        specifies how many "noise words" should be drawn (usually between 5-20).

        `cbow_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean.
        Only applies when cbow is used.

        `hashfxn` = hash function to use to randomly initialize weights, for increased
        training reproducibility. Default is Python's rudimentary built-in hash function.

        `iter` = number of iterations (epochs) over the corpus.

        """
        self.vocab = {}  # mapping from a word (string) to a Vocab object
        self.index2word = [
        ]  # map from a word's matrix index (int) to word (string)
        self.sg = int(sg)
        self.table = None  # for negative sampling --> this needs a lot of RAM! consider setting back to None before saving
        self.layer1_size = int(size)
        if size % 4 != 0:
            logger.warning(
                "consider setting layer size to a multiple of 4 for greater performance"
            )
        self.alpha = float(alpha)
        self.window = int(window)
        self.seed = seed
        self.min_count = min_count
        self.sample = sample
        self.workers = workers
        self.min_alpha = min_alpha
        self.hs = hs
        self.negative = negative
        self.cbow_mean = int(cbow_mean)
        self.hashfxn = hashfxn
        self.iter = iter
        if sentences is not None:
            self.build_vocab(sentences)
            sentences = utils.RepeatCorpusNTimes(sentences, iter)
            self.train(sentences)
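
A short usage sketch for the constructor above. The class name `Word2Vec` and the toy corpus are illustrative assumptions:

# Hypothetical usage of the constructor above.
sentences = [
    ['the', 'quick', 'brown', 'fox'],
    ['jumps', 'over', 'the', 'lazy', 'dog'],
]
model = Word2Vec(
    sentences,      # triggers build_vocab() and train() inside __init__
    size=100,       # dimensionality of the word vectors
    window=5,       # maximum distance between current and predicted word
    min_count=1,    # keep every word in this tiny toy corpus
    sg=1,           # skip-gram (the default here); sg=0 selects CBOW
    iter=1,         # one pass over the corpus
)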
Example #3
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # stream each file line by line; close the handle when done
            with open(os.path.join(self.dirname, fname)) as fin:
                for line in fin:
                    yield line.split()

import logging
import os
from gensim import utils
from gensim.models import word2vec
from gensim.models.word2vec import LineSentence

number_iter = 1
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

jsonPath = '/home/ksoo/Downloads/scrapy_sample/scrapy_sample/corpus/'
saveModelName = 'gensimdata'
sentences = LineSentence(jsonPath + 'cp_copy.json')
try:
    model = word2vec.Word2Vec.load(saveModelName + '.model')
except Exception:
    print("training a new model")
    model = word2vec.Word2Vec(size=30, window=8, workers=8)
    model.build_vocab(sentences)

ss = utils.RepeatCorpusNTimes(sentences, number_iter)
model.train(ss)

model.save(saveModelName + '.model')
model.save_word2vec_format(saveModelName + '.bin', binary=True)
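
For comparison with the LineSentence-based script above, a hedged sketch of training from the directory-streaming iterator defined at the top of this example. `DirSentences` is an assumed name for that class (only its methods are shown):

# Hypothetical: DirSentences stands in for the unnamed class defined at the top of
# this example (one whitespace-tokenized sentence per line, one file per document).
dir_sentences = DirSentences(jsonPath)
model = word2vec.Word2Vec(size=30, window=8, workers=8)
model.build_vocab(dir_sentences)
model.train(utils.RepeatCorpusNTimes(dir_sentences, number_iter))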
Example #4
    def train(self,
              sentences,
              total_words=None,
              word_count=0,
              total_examples=None,
              queue_factor=2,
              report_delay=0.1):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
        sentences are the same as those that were used to initially build the vocabulary.
        """
        self.loss = {}
        if FAST_VERSION < 0:
            import warnings
            warnings.warn(
                "C extension not loaded for Word2Vec, training will be slow. "
                "Install a C compiler and reinstall gensim for fast training.")
            self.neg_labels = []
            if self.negative > 0:
                # precompute negative labels optimization for pure-python training
                self.neg_labels = zeros(self.negative + 1)
                self.neg_labels[0] = 1.

        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using sg=%s hs=%s sample=%s negative=%s", self.workers,
            len(self.vocab), self.layer1_size, self.sg, self.hs, self.sample,
            self.negative)

        if not self.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0'):
            raise RuntimeError(
                "you must first finalize vocabulary before training the model")

        if total_words is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info(
                    "expecting %i sentences, matching count from corpus used for vocabulary survey",
                    total_examples)
            else:
                raise ValueError(
                    "you must provide either total_words or total_examples, to enable alpha and progress calculations"
                )

        job_tally = 0

        if self.iter > 1:
            sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
            total_words = total_words and total_words * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_loop():
            """Train the model, lifting lists of sentences from the job_queue."""
            work = matutils.zeros_aligned(
                self.layer1_size, dtype=REAL)  # per-thread private work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                sentences, alpha = job
                tally, raw_tally = self._do_train_job(sentences, alpha,
                                                      (work, neu1))
                progress_queue.put(
                    (len(sentences), tally, raw_tally))  # report back progress
                jobs_processed += 1
            # logger.debug("worker exiting, processed %i jobs", jobs_processed)

        def job_producer():
            """Fill jobs queue using the input `sentences` iterator."""
            job_batch, batch_size = [], 0
            pushed_words, pushed_examples = 0, 0
            next_alpha = self.alpha
            job_no = 0

            for sent_idx, sentence in enumerate(sentences):
                sentence_length = self._raw_word_count([sentence])

                # can we fit this sentence into the existing job batch?
                if batch_size + sentence_length <= self.batch_words:
                    # yes => add it to the current job
                    job_batch.append(sentence)
                    batch_size += sentence_length
                else:
                    # no => submit the existing job
                    # logger.debug("queueing job #%i (%i words, %i sentences) at alpha %.05f",
                    #              job_no, batch_size, len(job_batch), next_alpha)
                    job_no += 1
                    job_queue.put((job_batch, next_alpha))

                    # update the learning rate for the next job
                    if self.min_alpha < next_alpha:
                        if total_examples:
                            # examples-based decay
                            pushed_examples += len(job_batch)
                            progress = 1.0 * pushed_examples / total_examples
                        else:
                            # words-based decay
                            pushed_words += self._raw_word_count(job_batch)
                            progress = 1.0 * pushed_words / total_words
                        next_alpha = self.alpha - (self.alpha -
                                                   self.min_alpha) * progress
                        next_alpha = max(self.min_alpha, next_alpha)

                    # add the sentence that didn't fit as the first item of a new job
                    job_batch, batch_size = [sentence], sentence_length

            # add the last job too (may be significantly smaller than batch_words)
            if job_batch:
                # logger.debug("queueing job #%i (%i words, %i sentences) at alpha %.05f",
                #              job_no, batch_size, len(job_batch), next_alpha)
                job_no += 1
                job_queue.put((job_batch, next_alpha))

            if job_no == 0 and self.train_count == 0:
                logger.warning(
                    "train() called with an empty iterator (if not intended, "
                    "be sure to provide a corpus that offers restartable "
                    "iteration = an iterable).")

            # give the workers heads up that they can finish -- no more work!
            for _ in xrange(self.workers):
                job_queue.put(None)
            logger.debug("job loop exiting, total %i jobs", job_no)

        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [
            threading.Thread(target=worker_loop) for _ in xrange(self.workers)
        ]
        unfinished_worker_count = len(workers)
        workers.append(threading.Thread(target=job_producer))

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        example_count, trained_word_count, raw_word_count = 0, 0, word_count
        start, next_report = default_timer() - 0.00001, 1.0
        prev_example_count = 0

        while unfinished_worker_count > 0:
            report = progress_queue.get()  # blocks if workers too slow
            if report is None:  # a thread reporting that it finished
                unfinished_worker_count -= 1
                # logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count)
                continue
            examples, trained_words, raw_words = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_word_count += trained_words  # only words in vocab & sampled
            raw_word_count += raw_words

            # pace the progress loop: advance the report window every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                next_report = elapsed + report_delay

        # all done; report the final stats
        elapsed = default_timer() - start
        logger.info(
            "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s",
            raw_word_count, trained_word_count, elapsed,
            trained_word_count / elapsed)
        if job_tally < 10 * self.workers:
            logger.warn(
                "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay"
            )

        # check that the input corpus hasn't changed during iteration
        if total_examples and total_examples != example_count:
            logger.warn(
                "supplied example count (%i) did not equal expected count (%i)",
                example_count, total_examples)
        if total_words and total_words != raw_word_count:
            logger.warn(
                "supplied raw word count (%i) did not equal expected count (%i)",
                raw_word_count, total_words)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        self.clear_sims()

        return trained_word_count
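
The docstring's point about linear learning-rate decay is implemented inside `job_producer`; as a standalone illustration, that schedule reduces to the small helper below (the helper name is illustrative and not part of the class):

# Standalone sketch of the linear alpha decay applied by job_producer above.
def decayed_alpha(alpha, min_alpha, pushed, total):
    """Interpolate from alpha down to min_alpha as pushed/total approaches 1."""
    progress = 1.0 * pushed / total                      # examples- or words-based progress
    return max(min_alpha, alpha - (alpha - min_alpha) * progress)

# halfway through training with the defaults alpha=0.025, min_alpha=0.0001:
# decayed_alpha(0.025, 0.0001, 500, 1000) is roughly 0.01255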
Example #5
    def train(self,
              sentences,
              total_words=None,
              word_count=0,
              chunksize=100,
              total_examples=None,
              queue_factor=2,
              report_delay=1):
        """
        Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
        For FastSent, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)

        To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
        (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
        sentences are the same as those that were used to initially build the vocabulary.

        """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn(
                "C extension not loaded for FastSent, training will be slow. "
                "Install a C compiler and reinstall gensim for fast training.")
            self.neg_labels = []

        logger.info(
            "training model with %i workers on %i vocabulary and %i features, "
            "using sample=%s", self.workers, len(self.vocab), self.layer1_size,
            self.sample)

        if not self.vocab:
            raise RuntimeError(
                "you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0'):
            raise RuntimeError(
                "you must first finalize vocabulary before training the model")

        if total_words is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info(
                    "expecting %i examples, matching count from corpus used for vocabulary survey",
                    total_examples)
            else:
                raise ValueError(
                    "you must provide either total_words or total_examples, to enable alpha and progress calculations"
                )

        if self.iter > 1:
            sentences = utils.RepeatCorpusNTimes(sentences, self.iter)
            total_words = total_words and total_words * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_init():
            work = matutils.zeros_aligned(
                self.layer1_size, dtype=REAL)  # per-thread private work memory
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
            return (work, neu1)

        def worker_one_job(job, inits):
            items, alpha = job
            if items is None:  # signal to finish
                return False
            # train & return tally
            tally, raw_tally = self._do_train_job(items, alpha, inits)
            progress_queue.put(
                (len(items), tally, raw_tally))  # report progress
            return True

        # loop of a given worker: fetches the data from the queue and then
        # launches the worker_one_job function
        def worker_loop():
            """Train the model, lifting lists of sentences from the jobs queue."""
            init = worker_init()
            while True:
                job = job_queue.get()
                if not worker_one_job(job, init):
                    break

        start, next_report = default_timer(), 1.0

        # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
        if self.workers > 0:
            job_queue = Queue(maxsize=queue_factor * self.workers)
        else:
            job_queue = FakeJobQueue(worker_init, worker_one_job)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [
            threading.Thread(target=worker_loop) for _ in xrange(self.workers)
        ]
        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()
        pushed_words = 0
        pushed_examples = 0
        example_count = 0
        trained_word_count = 0
        raw_word_count = word_count
        push_done = False
        done_jobs = 0
        next_alpha = self.alpha
        jobs_source = enumerate(utils.grouper(sentences, chunksize))
        # fill jobs queue with (sentence, alpha) job tuples
        while True:
            try:
                job_no, items = next(jobs_source)
                logger.debug("putting job #%i in the queue at alpha %.05f",
                             job_no, next_alpha)
                job_queue.put((items, next_alpha))
                # update the learning rate before every next job
                if self.min_alpha < next_alpha:
                    if total_examples:
                        # examples-based decay
                        pushed_examples += len(items)
                        next_alpha = self.alpha - (
                            self.alpha - self.min_alpha) * (
                                1.0 * pushed_examples / total_examples)
                    else:
                        # words-based decay
                        pushed_words += self._raw_word_count(items)
                        next_alpha = self.alpha - (
                            self.alpha - self.min_alpha) * (
                                1.0 * pushed_words / total_words)
                    next_alpha = max(next_alpha, self.min_alpha)
            except StopIteration:
                logger.info(
                    "reached end of input; waiting to finish %i outstanding jobs",
                    job_no - done_jobs + 1)
                for _ in xrange(self.workers):
                    job_queue.put(
                        (None, 0)
                    )  # give the workers heads up that they can finish -- no more work!
                push_done = True
            try:
                while done_jobs < (job_no + 1) or not push_done:
                    examples, trained_words, raw_words = progress_queue.get(
                        push_done)  # only block after all jobs pushed
                    example_count += examples
                    trained_word_count += trained_words  # only words in vocab & sampled
                    raw_word_count += raw_words
                    done_jobs += 1
                    elapsed = default_timer() - start
                    if elapsed >= next_report:
                        if total_examples:
                            # examples-based progress %
                            logger.info(
                                "FASTSENT MODEL PROGRESS: at %.2f%% examples, %.0f words/s",
                                100.0 * example_count / total_examples,
                                trained_word_count / elapsed)
                        else:
                            # words-based progress %
                            logger.info(
                                "FASTSENT MODEL PROGRESS: at %.2f%% words, %.0f words/s",
                                100.0 * raw_word_count / total_words,
                                trained_word_count / elapsed)
                        next_report = elapsed + report_delay  # don't flood log, wait report_delay seconds
                else:
                    # loop ended by job count; really done
                    break
            except Empty:
                pass  # already out of loop; continue to next push

        elapsed = default_timer() - start
        logger.info(
            "training on %i raw words took %.1fs, %.0f trained words/s",
            raw_word_count, elapsed,
            trained_word_count / elapsed if elapsed else 0.0)

        if total_examples and total_examples != example_count:
            logger.warn(
                "supplied example count (%i) did not equal expected count (%i)",
                example_count, total_examples)
        if total_words and total_words != raw_word_count:
            logger.warn(
                "supplied raw word count (%i) did not equal expected count (%i)",
                raw_word_count, total_words)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        self.clear_sims()
        return trained_word_count
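
This variant forms its jobs with `utils.grouper(sentences, chunksize)`. As a rough, plain-Python stand-in for what that chunking does (not gensim's actual implementation, which also supports numpy-copied chunks):

# Sketch of chunking an iterable into fixed-size job lists.
import itertools

def grouper(iterable, chunksize):
    """Yield consecutive lists of at most chunksize items from iterable."""
    it = iter(iterable)
    while True:
        chunk = list(itertools.islice(it, chunksize))
        if not chunk:
            break
        yield chunk

# list(grouper(range(5), 2)) -> [[0, 1], [2, 3], [4]]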
Example #6
    def train(self,
              train_pairs,
              total_ptrees=None,
              ptree_count=0,
              total_examples=None,
              queue_factor=2,
              report_delay=1.0):
        """
    An example is a pair of a complete AST tree and an utterance.
    A ptree is a partial tree.
    """
        if FAST_VERSION < 0:
            import warnings
            warnings.warn("C extension not loaded; training will be slow.")
            # precompute negative labels optimization for pure-python training
            self.neg_labels = zeros(self.negative + 1 +
                                    NEG_SAMPLING_VOCAB_SIZE_THRESHOLD)
            self.neg_labels[0] = 1.

        logger.info("training model with %i workers", self.workers)

        if not self.vocab_i or not self.vocab_k or not self.vocab_l or not self.vocab_r:
            raise RuntimeError(
                "you must first build vocabulary before training the model")
        if not hasattr(self, 'syn0i') or not hasattr(
                self, 'syn0k') or not hasattr(self, 'syn0l'):
            raise RuntimeError(
                "you must first finalize vocabulary before training the model")

        if total_ptrees is None and total_examples is None:
            if self.corpus_count:
                total_examples = self.corpus_count
                logger.info(
                    "expecting %i train pairs, matching count from corpus used for vocabulary survey",
                    total_examples)
            else:
                raise ValueError(
                    "you must provide either total_ptrees or total_examples, to enable alpha and progress calculations"
                )

        job_tally = 0

        if self.iter > 1:
            train_pairs = gsutils.RepeatCorpusNTimes(train_pairs, self.iter)
            total_ptrees = total_ptrees and total_ptrees * self.iter
            total_examples = total_examples and total_examples * self.iter

        def worker_loop():
            """Train the model, lifting lists of train_pairs from the job_queue."""

            # per-thread private work memory - useless in numpy implementation
            work = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.vector_size, dtype=REAL)
            jobs_processed = 0
            while True:
                job = job_queue.get()
                if job is None:
                    progress_queue.put(None)
                    break  # no more jobs => quit this worker
                train_pairs, alpha = job
                tally, raw_tally = self._do_train_job(train_pairs, alpha,
                                                      (work, neu1))
                progress_queue.put((len(train_pairs), tally,
                                    raw_tally))  # report back progress
                jobs_processed += 1
            logger.debug("worker exiting, processed %i jobs", jobs_processed)

        def job_producer():
            """Fill jobs queue using the input `train_pairs` iterator."""
            job_batch, batch_size = [], 0
            pushed_ptrees, pushed_examples = 0, 0
            next_alpha = self.alpha
            job_no = 0

            for train_pair in train_pairs:
                train_pair_length = self._raw_ptree_count([train_pair])

                # can we fit this train_pair into the existing job batch?
                if batch_size + train_pair_length <= self.batch_ptrees:
                    # yes => add it to the current job
                    job_batch.append(train_pair)
                    batch_size += train_pair_length
                else:
                    # no => submit the existing job
                    logger.debug(
                        "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                        job_no, batch_size, len(job_batch), next_alpha)
                    job_no += 1
                    job_queue.put((job_batch, next_alpha))

                    # update the learning rate for the next job
                    if self.min_alpha < next_alpha:
                        if total_examples:
                            # examples-based decay
                            pushed_examples += len(job_batch)
                            progress = 1.0 * pushed_examples / total_examples
                        else:
                            # ptrees-based decay
                            pushed_ptrees += self._raw_ptree_count(job_batch)
                            progress = 1.0 * pushed_ptrees / total_ptrees
                        next_alpha = self.alpha - (self.alpha -
                                                   self.min_alpha) * progress
                        next_alpha = max(self.min_alpha, next_alpha)

                    # add the train_pair that didn't fit as the first item of a new job
                    job_batch, batch_size = [train_pair], train_pair_length

            # add the last job too (may be significantly smaller than batch_ptrees)
            if job_batch:
                logger.debug(
                    "queueing job #%i (%i ptrees, %i train_pairs) at alpha %.05f",
                    job_no, batch_size, len(job_batch), next_alpha)
                job_no += 1
                job_queue.put((job_batch, next_alpha))

            if job_no == 0 and self.train_count == 0:
                logger.warning(
                    "train() called with an empty iterator (if not intended, "
                    "be sure to provide a corpus that offers restartable "
                    "iteration = an iterable).")

            # give the workers heads up that they can finish -- no more work!
            for _ in xrange(self.workers):
                job_queue.put(None)
            logger.debug("job loop exiting, total %i jobs", job_no)

        # buffer ahead only a limited number of jobs.. this is the reason we can't
        # simply use ThreadPool :(
        job_queue = Queue(maxsize=queue_factor * self.workers)
        progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers)

        workers = [
            threading.Thread(target=worker_loop) for _ in xrange(self.workers)
        ]
        unfinished_worker_count = len(workers)
        workers.append(threading.Thread(target=job_producer))

        for thread in workers:
            thread.daemon = True  # make interrupting the process with ctrl+c easier
            thread.start()

        example_count, trained_ptree_count, raw_ptree_count = 0, 0, ptree_count
        start, next_report = default_timer() - 0.00001, 1.0

        while unfinished_worker_count > 0:
            report = progress_queue.get()  # blocks if workers too slow
            if report is None:  # a thread reporting that it finished
                unfinished_worker_count -= 1
                logger.info(
                    "worker thread finished; awaiting finish of %i more threads",
                    unfinished_worker_count)
                continue
            examples, trained_ptrees, raw_ptrees = report
            job_tally += 1

            # update progress stats
            example_count += examples
            trained_ptree_count += trained_ptrees  # only ptrees in vocab & sampled
            raw_ptree_count += raw_ptrees

            # log progress once every report_delay seconds
            elapsed = default_timer() - start
            if elapsed >= next_report:
                if total_examples:
                    # examples-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% examples, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                        100.0 * example_count / total_examples,
                        trained_ptree_count / elapsed,
                        gsutils.qsize(job_queue),
                        gsutils.qsize(progress_queue))
                else:
                    # ptrees-based progress %
                    logger.info(
                        "PROGRESS: at %.2f%% ptrees, %.0f ptrees/s, in_qsize %i, out_qsize %i",
                        100.0 * raw_ptree_count / total_ptrees,
                        trained_ptree_count / elapsed,
                        gsutils.qsize(job_queue),
                        gsutils.qsize(progress_queue))
                next_report = elapsed + report_delay

        # all done; report the final stats
        elapsed = default_timer() - start
        logger.info(
            "training on %i raw ptrees (%i effective ptrees) took %.1fs, %.0f effective ptrees/s",
            raw_ptree_count, trained_ptree_count, elapsed,
            trained_ptree_count / elapsed)
        if job_tally < 10 * self.workers:
            logger.warn(
                "under 10 jobs per worker: consider setting a smaller `batch_ptrees' for smoother alpha decay"
            )

        # check that the input corpus hasn't changed during iteration
        if total_examples and total_examples != example_count:
            logger.warn(
                "supplied example count (%i) did not equal expected count (%i)",
                example_count, total_examples)
        if total_ptrees and total_ptrees != raw_ptree_count:
            logger.warn(
                "supplied raw word count (%i) did not equal expected count (%i)",
                raw_ptree_count, total_ptrees)

        self.train_count += 1  # number of times train() has been called
        self.total_train_time += elapsed
        return trained_ptree_count
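
All three train() variants above share the same threading skeleton: a bounded job queue fanning work out to daemon worker threads, a progress queue fanning tallies back to the main thread, and None sentinels for shutdown. A minimal, self-contained sketch of that pattern (illustrative names; Python 3 queue module rather than the Python 2 Queue used above):

# Minimal sketch of the shared producer/worker/progress pattern; names are illustrative.
import threading
from queue import Queue   # Python 3; the examples above use the Python 2 Queue module

def run(jobs, num_workers=2, queue_factor=2):
    job_queue = Queue(maxsize=queue_factor * num_workers)   # bounded: producer blocks when full
    progress_queue = Queue()

    def worker_loop():
        while True:
            job = job_queue.get()
            if job is None:                 # sentinel: no more work for this worker
                progress_queue.put(None)
                break
            progress_queue.put(len(job))    # stand-in for the (examples, tally, raw_tally) report

    def job_producer():
        for job in jobs:
            job_queue.put(job)              # blocks while the job buffer is full
        for _ in range(num_workers):
            job_queue.put(None)             # one sentinel per worker

    threads = [threading.Thread(target=worker_loop) for _ in range(num_workers)]
    threads.append(threading.Thread(target=job_producer))
    for t in threads:
        t.daemon = True                     # make interrupting with ctrl+c easier, as above
        t.start()

    finished, processed = 0, 0
    while finished < num_workers:           # drain reports until every worker has signalled completion
        report = progress_queue.get()
        if report is None:
            finished += 1
        else:
            processed += report
    return processed

# run([['a', 'b'], ['c']]) -> 3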