def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" paragraph_work = zeros(self.paragraph_size, dtype=REAL) # each thread must have its own work memory error = zeros(1, dtype = REAL) if self.concatenate: # word work here is for each individual word, so it has length logistic regression - para size word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype = REAL) neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL) else: # here word work is aggregated: word_work = zeros(self.layer1_size, dtype = REAL) neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) zeros(self.logistic_regression_size, dtype = REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha # how many words did we train on? out-of-vocabulary (unknown) words do not count job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha, paragraph_work, word_work, neu1, error, len(job)) with lock: # here we can store the scores for later plotting and viewing... word_count[0] += job_words elapsed = time.time() - start total_error[0] += error[0] if elapsed >= next_report[0]: logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s," % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports
def worker_train(): """Train the model, lifting lists of sentences from the jobs queue.""" work = zeros( self.layer1_size, dtype=REAL) # each thread must have its own work memory neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = jobs.get() if job is None: # data finished, exit break # update the learning rate before every job alpha = max( self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words) ) if self.weight_decay else self.alpha # how many words did we train on? out-of-vocabulary (unknown) words do not count job_words = 0 for sentence in job: job_words += self.training_function( self, sentence, alpha, work) with lock: # here we can store the scores for later plotting and viewing... word_count[0] += job_words elapsed = time.time() - start if elapsed >= next_report[0]: logger.debug( "PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" % (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0)) next_report[ 0] = elapsed + 1.0 # don't flood the log, wait at least a second between progress reports
def worker_loop():
    """Train the model, lifting lists of sentences from the job_queue."""
    work = matutils.zeros_aligned(self.layer1_size, dtype=REAL)  # per-thread private work memory
    neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
    jobs_processed = 0
    while True:
        job = job_queue.get()
        if job is None:
            progress_queue.put(None)
            break  # no more jobs => quit this worker
        sentences, pairwise, alpha = job
        tally, raw_tally = self._do_train_job(sentences, pairwise, alpha, (work, neu1))
        progress_queue.put((len(sentences), tally, raw_tally))  # report back progress
        jobs_processed += 1
    logger.debug("worker exiting, processed %i jobs", jobs_processed)

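# A minimal, self-contained sketch (not part of the original module) of the
# producer/worker protocol that worker_loop relies on: the producer puts
# (sentences, pairwise, alpha) tuples on job_queue plus one None per worker as
# a poison pill, and each worker echoes a progress tuple per job (or None on
# exit) onto progress_queue. All names except job_queue/progress_queue and the
# job tuple layout are hypothetical.
def _demo_worker_protocol():
    import threading
    from queue import Queue  # Python 3 import; the module's try/except covers Python 2
    job_queue, progress_queue = Queue(), Queue()
    job_queue.put(([['a', 'sentence']], [], 0.025))  # one job: (sentences, pairwise, alpha)
    job_queue.put(None)  # poison pill => worker exits

    def fake_worker():
        while True:
            job = job_queue.get()
            if job is None:
                progress_queue.put(None)
                break
            sentences, pairwise, alpha = job
            progress_queue.put((len(sentences), 0, 0))  # (examples, tally, raw_tally)

    t = threading.Thread(target=fake_worker)
    t.start()
    while True:  # drain progress reports until the worker signals exit
        report = progress_queue.get()
        if report is None:
            break
        print("progress report:", report)
    t.join()
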
def accuracy(self, questions, restrict_vocab=30000):
    """
    Compute accuracy of the model (with **capitalizations**). `questions` is a filename where
    lines are 4-tuples of words, split into sections by ": SECTION NAME" lines.
    See https://code.google.com/p/word2vec/source/browse/trunk/questions-words.txt for an example.

    The accuracy is reported (=printed to log and returned as a list) for each
    section separately, plus there's one aggregate summary at the end.

    Use `restrict_vocab` to ignore all questions containing a word whose frequency
    is not in the top-N most frequent words (default top 30,000).

    This method corresponds to the `compute-accuracy` script of the original C word2vec.
    """
    ok_vocab = dict(sorted(self.vocab.items(), key=lambda item: -item[1].count)[:restrict_vocab])
    ok_index = set(v.index for v in ok_vocab.values())

    def log_accuracy(section):
        correct, incorrect = section['correct'], section['incorrect']
        if correct + incorrect > 0:
            logger.info("%s: %.1f%% (%i/%i)" %
                        (section['section'], 100.0 * correct / (correct + incorrect),
                         correct, correct + incorrect))

    sections, section = [], None
    for line_no, line in enumerate(open(questions)):
        # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed
        if line.startswith(': '):
            # a new section starts => store the old section
            if section:
                sections.append(section)
                log_accuracy(section)
            section = {'section': line.lstrip(': ').strip(), 'correct': 0, 'incorrect': 0}
        else:
            if not section:
                raise ValueError("missing section header before line #%i in %s" % (line_no, questions))
            try:
                a, b, c, expected = line.split()  # TODO assumes vocabulary preprocessing uses lowercase, too...
            except ValueError:
                logger.info("skipping invalid line #%i in %s" % (line_no, questions))
                continue
            if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab:
                logger.debug("skipping line #%i with OOV words: %s" % (line_no, line))
                continue

            ignore = set(self.vocab[v].index for v in [a, b, c])  # indexes of words to ignore
            predicted = None
            # find the most likely prediction, ignoring OOV words and input words
            for index in argsort(self.most_similar(positive=[b, c], negative=[a], topn=False))[::-1]:
                if index in ok_index and index not in ignore:
                    predicted = self.index2word[index]
                    if predicted != expected and predicted != expected.lower():
                        logger.debug("%s: expected %s, predicted %s" % (line.strip(), expected, predicted))
                    break
            section['correct' if predicted == expected else 'incorrect'] += 1
    if section:
        # store the last section, too
        sections.append(section)
        log_accuracy(section)

    total = {
        'section': 'total',
        'correct': sum(s['correct'] for s in sections),
        'incorrect': sum(s['incorrect'] for s in sections),
    }
    log_accuracy(total)
    sections.append(total)
    return sections

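# Hypothetical usage sketch (not from the original source): evaluate a trained
# model against the standard Google analogy test set. Assumes `model` is a
# trained instance exposing the accuracy() method above, and that a copy of
# questions-words.txt is available locally.
def _demo_accuracy(model):
    sections = model.accuracy('questions-words.txt', restrict_vocab=30000)
    for s in sections:
        answered = s['correct'] + s['incorrect']
        if answered:
            print("%s: %.1f%%" % (s['section'], 100.0 * s['correct'] / answered))
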
def job_producer():
    """Fill jobs queue using the input `sentences` iterator."""
    job_batch, batch_size = [], 0
    pushed_words, pushed_examples = 0, 0
    next_alpha = self.alpha
    if next_alpha > self.min_alpha_yet_reached:
        logger.warning("Effective 'alpha' higher than previous training cycles")
    self.min_alpha_yet_reached = next_alpha
    job_no = 0

    for sent_idx, sentence in enumerate(sentences):
        sentence_length = self._raw_word_count([sentence])

        # can we fit this sentence into the existing job batch?
        if batch_size + sentence_length <= self.batch_words:
            # yes => add it to the current job
            job_batch.append(sentence)
            batch_size += sentence_length
        else:
            # no => submit the existing job, sampling 20% as many pairwise constraints as there are words
            pair_idx = list(numpy.random.choice(range(len(self.pairwise_constraints)), int(batch_size * 0.2)))
            pairwise_samples = [self.pairwise_constraints[x] for x in pair_idx]
            logger.debug(
                "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
                job_no, batch_size, len(job_batch), len(pairwise_samples), next_alpha)
            job_no += 1
            job_queue.put((job_batch, pairwise_samples, next_alpha))

            # update the learning rate for the next job
            if self.min_alpha < next_alpha:
                if total_examples:
                    # examples-based decay
                    pushed_examples += len(job_batch)
                    progress = 1.0 * pushed_examples / total_examples
                else:
                    # words-based decay
                    pushed_words += self._raw_word_count(job_batch)
                    progress = 1.0 * pushed_words / total_words
                next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress
                next_alpha = max(self.min_alpha, next_alpha)

            # add the sentence that didn't fit as the first item of a new job
            job_batch, batch_size = [sentence], sentence_length

    # add the last job too (may be significantly smaller than batch_words)
    if job_batch:
        logger.debug(
            "queueing job #%i (%i words, %i sentences, %i constraints) at alpha %.05f",
            job_no, batch_size, len(job_batch), len(self.pairwise_constraints), next_alpha)
        job_no += 1
        job_queue.put((job_batch, self.pairwise_constraints, next_alpha))

    if job_no == 0 and self.train_count == 0:
        logger.warning(
            "train() called with an empty iterator (if not intended, "
            "be sure to provide a corpus that offers restartable "
            "iteration = an iterable).")

    # give the workers heads up that they can finish -- no more work!
    for _ in xrange(self.workers):
        job_queue.put(None)
    logger.debug("job loop exiting, total %i jobs", job_no)

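# Worked example (illustrative values, not from the original source) of the
# linear learning-rate decay used by job_producer above: alpha falls linearly
# from self.alpha toward self.min_alpha as training progress goes from 0 to 1,
# and is clamped so it never drops below min_alpha.
def _demo_alpha_decay(alpha=0.025, min_alpha=0.0001, total_words=1000000):
    for pushed_words in (0, 250000, 500000, 1000000):
        progress = 1.0 * pushed_words / total_words
        next_alpha = max(min_alpha, alpha - (alpha - min_alpha) * progress)
        print("%3.0f%% -> alpha %.5f" % (100 * progress, next_alpha))
    # prints roughly: 0% -> 0.02500, 25% -> 0.01878, 50% -> 0.01255, 100% -> 0.00010
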
import logging
import time
import threading

import numpy
from numpy import (
    exp, log, dot, zeros, outer, random, dtype, float32 as REAL, double,
    uint32, seterr, array, uint8, vstack, fromstring, sqrt, newaxis, ndarray,
    empty, sum as np_sum, prod, ones, ascontiguousarray, argsort)

from gensim import utils, matutils
from six import iteritems, itervalues, string_types
from six.moves import xrange
from timeit import default_timer
from random import shuffle

try:
    from queue import Queue, Empty
except ImportError:
    from Queue import Queue, Empty

logger = logging.getLogger(__name__)

try:
    from gensim.models.word2vec_inner import train_batch_sg, train_batch_cbow
    from gensim.models.word2vec_inner import score_sentence_sg, score_sentence_cbow
    from gensim.models.word2vec_inner import FAST_VERSION, MAX_WORDS_IN_BATCH
    logger.debug("Fast version of {0} is being used".format(__name__))
except ImportError:
    # failed... fall back to plain numpy (20-80x slower training than the above)
    logger.warning("Slow version of {0} is being used".format(__name__))
    FAST_VERSION = -1
    MAX_WORDS_IN_BATCH = 10000


def train_batch_sg_constraints(model, constraints, alpha, work=None):
    """This function adds an additional constraint to the representation."""
    result = 0
    for constraint in constraints:
        word = model.vocab[constraint[0]]
        word2 = model.vocab[constraint[1]]
        # the representation of word2.index is used to predict model.index2word[word.index]
        # (assumed completion: reuse the module's skip-gram pair update, following
        # the train_sg_pair convention of gensim's pure-python train_batch_sg)
        train_sg_pair(model, model.index2word[word.index], word2.index, alpha)
        result += 1
    return result

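# Hypothetical usage sketch (names and words assumed, not from the original
# source): each constraint is a pair of in-vocabulary words whose
# representations are pulled together by the skip-gram update, e.g. known
# synonym pairs. Assumes `model` is a trained model containing these words.
def _demo_constraints(model):
    constraints = [('car', 'automobile'), ('big', 'large')]
    pairs_trained = train_batch_sg_constraints(model, constraints, alpha=0.025)
    print("trained on %i constraint pairs" % pairs_trained)
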
def train(self, sentences, total_words=None, word_count=0, paragraphs_only=False, vocab=None, paragraphs=None):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    Each sentence must be a list of utf8 strings.
    """
    if paragraphs is None:
        paragraphs = self.synparagraph
    if vocab is None:
        vocab = self.paragraph_vocab
    if not self.vocab:
        raise RuntimeError("you must first build vocabulary before training the model")

    start, next_report = time.time(), [1.0]
    word_count, total_words = [word_count], total_words or sum(v.count for v in itervalues(self.vocab))
    jobs = Queue(maxsize=2 * self.workers)  # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :(
    lock = threading.Lock()  # for shared state (=number of words trained so far, log reports...)
    total_error = [0.0]

    def worker_train():
        """Train the model, lifting lists of sentences from the jobs queue."""
        paragraph_work = zeros(self.paragraph_size, dtype=REAL)  # each thread must have its own work memory
        error = zeros(1, dtype=REAL)
        if self.concatenate:
            # word work here is for each individual word, so it has length logistic regression - para size
            word_work = zeros(self.logistic_regression_size - self.paragraph_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.logistic_regression_size, dtype=REAL)
        else:
            # here word work is aggregated:
            word_work = zeros(self.layer1_size, dtype=REAL)
            neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

        while True:
            job = jobs.get()
            if job is None:  # data finished, exit
                break
            # update the learning rate before every job
            alpha = max(self.min_alpha, self.alpha * (1 - 1.0 * word_count[0] / total_words)) if self.weight_decay else self.alpha
            # how many words did we train on? out-of-vocabulary (unknown) words do not count
            job_words = self.training_function(self, job, paragraphs, paragraphs_only, alpha,
                                               paragraph_work, word_work, neu1, error, len(job))
            with lock:
                # here we can store the scores for later plotting and viewing...
                word_count[0] += job_words
                elapsed = time.time() - start
                total_error[0] += error[0]
                if elapsed >= next_report[0]:
                    logger.debug("PROGRESS: at %.2f%% words, alpha %.05f, %.0f words/s" %
                                 (100.0 * word_count[0] / total_words, alpha, word_count[0] / elapsed if elapsed else 0.0))
                    next_report[0] = elapsed + 1.0  # don't flood the log, wait at least a second between progress reports

    workers = [threading.Thread(target=worker_train) for _ in xrange(self.workers)]
    for thread in workers:
        thread.daemon = True  # make interrupting the process with ctrl+c easier
        thread.start()

    # convert input strings to Vocab objects, and paragraph to paragraph (Vocab) object:
    no_oov = (self.create_job(sentence, vocab) for sentence in sentences)
    for job_no, job in enumerate(utils.grouper(no_oov, self.batchsize)):
        logger.debug("putting job #%i in the queue, qsize=%i" % (job_no, jobs.qsize()))
        jobs.put(job)
    logger.info("reached the end of input; waiting to finish %i outstanding jobs" % jobs.qsize())
    for _ in xrange(self.workers):
        jobs.put(None)  # give the workers heads up that they can finish -- no more work!

    for thread in workers:
        thread.join()

    elapsed = time.time() - start
    logger.info("training on %i words took %.1fs, %.0f words/s, total error %.6f" %
                (word_count[0], elapsed, word_count[0] / elapsed if elapsed else 0.0, total_error[0]))

    return (word_count[0], total_error[0])

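# Hypothetical usage sketch (assumes a model whose vocabulary and paragraph
# vocabulary were already built over the same corpus, e.g. via a vocabulary
# building step; nothing here is guaranteed by the original source):
def _demo_train(model):
    sentences = [['the', 'quick', 'brown', 'fox'],
                 ['jumps', 'over', 'the', 'lazy', 'dog']]
    words_trained, total_error = model.train(sentences)
    print("trained %i words, cumulative error %.6f" % (words_trained, total_error))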