def add_new_labels(sentences, model):
    """Add new sentence labels (for new docs) to the doc2vec model's vocab.

    Adapted from: https://gist.github.com/zseder/4201551d7f8608f0b82b

    Parameters
    ----------
    sentences : iterable
        Labeled sentences; each item exposes `.words` (list of tokens) and
        `.labels` (list of strings shaped like "SENT_<n>").
    model : doc2vec-style model
        Its `.vocab` dict and `.index2word` list are extended IN PLACE.

    Returns
    -------
    int
        Number of sentences processed.
    """
    total_words = 0
    vocab = model.vocab
    # Offset incoming label numbers past the highest existing SENT_<n> so the
    # new labels do not collide with ones already in the model.
    existing_ids = [int(l.split('_')[-1]) for l in vocab if l.startswith("SENT")]
    # Guard: max() on an empty sequence raises ValueError when the model has
    # no SENT labels yet; start from 0 in that case.
    model_sentence_n = max(existing_ids) if existing_ids else 0
    n_sentences = 0
    for sentence in sentences:
        sentence_length = len(sentence.words)
        for label in sentence.labels:
            label_e = label.split("_")
            label_n = int(label_e[1]) + model_sentence_n
            label = "{0}_{1}".format(label_e[0], label_n)
            total_words += 1
            if label in vocab:
                vocab[label].count += sentence_length
            else:
                vocab[label] = Vocab(count=sentence_length)
                # The entry was just inserted, so len(vocab) - 1 is its index.
                vocab[label].index = len(model.vocab) - 1
                vocab[label].code = [0]
                vocab[label].sample_probability = 1.
                model.index2word.append(label)
        n_sentences += 1
    return n_sentences
def build_vocab(self, sentences): logger.info("collecting all words and their counts") vocab = self._vocab_from_new(sentences) # assign a unique index to each word self.vocab, self.index2word = {}, [] for meta_word in [ self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab ]: v = Vocab(count=1) v.index = len(self.vocab) v.sample_probability = 1.0 self.index2word.append(meta_word) self.vocab[meta_word] = v # remove word with count < min_count, default min_count = 5 in gensim, Seger changed to 1 # actually, not remove any words # build self.vocab word->Vocab dict, and assign a unique index to each word for subgram, v in iteritems(vocab): if v.count >= self.min_count: v.sample_probability = 1.0 v.index = len(self.vocab) self.index2word.append(subgram) self.vocab[subgram] = v logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count)) logger.info('reset weights') if self.hybrid_pred: # v is word # get single character word frequency freq_list = [ self.vocab[v].count for v in self.vocab if len(v) == 1 ] freq_list.sort(reverse=True) self.hybrid_threshold = freq_list[len(freq_list) / 25] print '>frequencey threshold for hybrid prediction is:', self.hybrid_threshold self.reset_weights()
def build_vocab(self, sentences): logger.info("collecting all words and their counts") vocab = self._vocab_from_new(sentences) # assign a unique index to each word self.vocab, self.index2word = {}, [] for meta_word in [self.label0_as_vocab, self.label1_as_vocab, self.unknown_as_vocab]: v = Vocab(count=1) v.index = len(self.vocab) v.sample_probability = 1.0 self.index2word.append(meta_word) self.vocab[meta_word] = v # remove word with count < min_count, default min_count = 5 in gensim, Seger changed to 1 # actually, not remove any words # build self.vocab word->Vocab dict, and assign a unique index to each word for subgram, v in iteritems(vocab): if v.count >= self.min_count: v.sample_probability = 1.0 v.index = len(self.vocab) self.index2word.append(subgram) self.vocab[subgram] = v logger.info("total %i word types after removing those with count<%s" % (len(self.vocab), self.min_count)) logger.info('reset weights') if self.hybrid_pred: # v is word # get single character word frequency freq_list = [self.vocab[v].count for v in self.vocab if len(v) == 1] freq_list.sort(reverse=True) self.hybrid_threshold = freq_list[len(freq_list) / 25] print '>frequencey threshold for hybrid prediction is:', self.hybrid_threshold self.reset_weights()