def learn_rc_tensors(words, modifiers, compounds, weight=(lambda c: np.log(c)), which="orc",
                     min_count=2, parameter=80.0, report_freq=20):
    """Learn one (dim, dim*dim) tensor per lexical function from relative-clause
    phrase vectors, using regularised least squares for each function."""
    logger.info("regularisation parameter: %s", str(parameter))
    if which == "orc":
        arg1, arg2 = 3, 2
        logger.info("learning object relative pronouns")
    else:
        arg1, arg2 = 2, 3
        logger.info("learning subject relative pronouns")

    # build LexicalFunctions object
    lfs_counter = Counter(m for _, m, *_ in compounds.index2name)
    lfs_index2name = [m for m, _ in lfs_counter.most_common()]
    vocabsize = len(lfs_index2name)
    dim = words.shape[1]
    arr = np.zeros((vocabsize, dim, dim * dim), dtype=floatX)
    eye = np.eye(dim * dim)
    lfs = LexicalFunctions(arr, lfs_index2name)
    logger.info("initialised a %s x %s x %s tensor embedding tensor",
                si(vocabsize), si(dim), si(dim * dim))
    t = Timer(interval=report_freq)

    # build a list of training examples for each lexical function
    t.tic()
    phrase_examples = defaultdict(list)
    for i, (name, count) in enumerate(zip(compounds.index2name, compounds.index2count)):
        if count >= min_count and name[arg1] in modifiers.name2index:
            phrase_examples[name[1]].append((words.name2index[name[0]],
                                             modifiers.name2index[name[arg1]],
                                             words.name2index[name[arg2]],
                                             i, count))
    logger.info("Examples built in " + t.toc(hms=True))

    # solve AX = B for each lexical function
    t.tic()
    for ex_num, (lf_name, examples) in enumerate(phrase_examples.items()):
        if len(examples) < 1:
            continue
        B = np.zeros((len(examples), dim))
        A = np.zeros((len(examples), dim * dim))
        for i, (noun1_index, modifier_index, noun2_index, phrase_index, count) in enumerate(examples):
            w = weight(count)
            # design row: flattened outer product of the head noun vector with
            # the (modifier matrix . argument noun vector) product
            A[i] = w * np.outer(words[noun1_index],
                                np.dot(modifiers[modifier_index], words[noun2_index])).flatten()
            B[i] = w * compounds[phrase_index]
        # ridge regression in closed form: X = (A^T A + lambda*I)^-1 A^T B
        tmp1 = pinv(np.dot(A.T, A) + parameter * eye)
        tmp2 = np.dot(A.T, B)
        lfs.A[lfs.name2index[lf_name]] = np.dot(tmp1, tmp2).T
        if t.ready():
            t.toc()
            logger.info("%.2f%% matrices (%s, %s)"
                        % (100 * (ex_num + 1) / vocabsize, si(ex_num + 1), t.toc(hms=True)))
    logger.info("learned %s tensors in %s" % (si(vocabsize), t.toc(hms=True)))
    return lfs
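# The per-function solve above is a standard ridge regression in closed form: each
# training phrase contributes one weighted row to the design matrix A (a flattened
# outer product) and one weighted row to the target matrix B (the observed phrase
# vector), and the learned slice is X = (A^T A + lambda*I)^-1 A^T B, stored
# transposed. The sketch below mirrors that step on toy random data; the function
# name, dimensions, and data are illustrative assumptions, not part of the model.
def _ridge_solve_sketch(dim=5, n_examples=12, lam=80.0):
    """Illustrative only: closed-form ridge regression on toy data, mirroring the
    per-function solve in learn_rc_tensors."""
    rng = np.random.default_rng(0)
    A = rng.standard_normal((n_examples, dim * dim))  # one flattened outer product per phrase
    B = rng.standard_normal((n_examples, dim))        # one observed phrase vector per phrase
    X = np.dot(pinv(np.dot(A.T, A) + lam * np.eye(dim * dim)), np.dot(A.T, B))
    return X.T                                        # stored transposed, as above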
def load_contexts(self, contexts):
    shape = contexts.A.shape
    if shape[0] != len(contexts.index2name) or shape[1] != self.dim:
        logger.error("vocabulary/contexts shape mismatch")
        raise RuntimeError("vocabulary/contexts shape mismatch")
    self.contexts = contexts.A
    self.index2name = contexts.index2name
    self.index2count = contexts.index2count
    self.name2index = {e: i for i, e in enumerate(self.index2name)}
    logger.info('loaded a %s x %s context matrix', si(len(self.index2name)), si(self.dim))
    self._finalise_vocab()
def _downsample_vocab(self):
    retain_total = sum(self.index2count)
    # Precalculate each vocabulary item's threshold for sampling
    if not self.sample:
        # no words downsampled
        threshold_count = retain_total
    else:
        # set parameter as proportion of total
        threshold_count = self.sample * retain_total

    self.index2sample = []
    downsample_total, downsample_unique = 0, 0
    for w in range(len(self.index2name)):
        v = self.index2count[w]
        word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v)
        if word_probability < 1.0:
            downsample_unique += 1
            downsample_total += word_probability * v
        else:
            word_probability = 1.0
            downsample_total += v
        # store the keep probability as an integer threshold in [0, 2**32]
        self.index2sample.append(int(round(word_probability * 2**32)))
    logger.info("sample=%g downsampled the %s most common words", self.sample, si(downsample_unique))
    logger.info("downsampling will reduce the corpus to approximately %.1f%% of its original size",
                downsample_total * 100.0 / max(retain_total, 1))
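# The thresholds built above follow the word2vec subsampling scheme: a word with
# count f is kept with probability sqrt(t/f) + t/f (t = sample * corpus size),
# scaled to an integer in [0, 2**32] so it can be compared against a random
# 32-bit value per occurrence. How the compiled train_sentence routine consumes
# index2sample is not shown here, so the filter below is only a sketch; the
# helper name and the expectation that rng is a numpy Generator are assumptions.
def _keep_indices_sketch(sentence, index2sample, rng):
    """Illustrative only: subsample a sentence of word indices using the integer
    thresholds built by _downsample_vocab (threshold / 2**32 is the probability
    of keeping each occurrence of that word)."""
    return [w for w in sentence if index2sample[w] >= rng.integers(0, 2**32)]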
def learn_matrices(words, compounds, weight=(lambda c: np.log(c)), min_count=2,
                   parameter=80, report_freq=20):
    """Learn one (dim, dim) matrix per modifier from observed (modifier, noun)
    compound phrase vectors, using regularised least squares for each modifier."""
    # build LexicalFunctions object
    lfs_counter = Counter(a for a, _ in compounds.index2name)
    lfs_index2name = [a for a, _ in lfs_counter.most_common()]
    vocabsize = len(lfs_index2name)
    dim = words.shape[1]
    arr = np.zeros((vocabsize, dim, dim), dtype=floatX)
    eye = np.eye(dim)
    lfs = LexicalFunctions(arr, lfs_index2name)
    logger.info("initialised a %s x %s x %s matrix embedding tensor", si(vocabsize), si(dim), si(dim))
    t = Timer(interval=report_freq)

    # build a list of training examples for each modifier
    t.tic()
    phrase_examples = defaultdict(list)
    for i, (name, count) in enumerate(zip(compounds.index2name, compounds.index2count)):
        if count >= min_count:
            phrase_examples[name[0]].append((words.name2index[name[1]], i, count))
    logger.info("Examples built in " + t.toc(hms=True))

    # solve AX = B for each modifier
    t.tic()
    for ex_num, (lf_name, examples) in enumerate(phrase_examples.items()):
        if len(examples) < 1:
            continue
        B = np.zeros((len(examples), dim))
        A = np.zeros((len(examples), dim))
        for i, (noun_index, phrase_index, count) in enumerate(examples):
            w = weight(count)
            A[i] = w * words[noun_index]
            B[i] = w * compounds[phrase_index]
        # ridge regression in closed form: X = (A^T A + lambda*I)^-1 A^T B
        tmp1 = pinv(np.dot(A.T, A) + parameter * eye)
        tmp2 = np.dot(A.T, B)
        lfs.A[lfs.name2index[lf_name]] = np.dot(tmp1, tmp2).T
        if t.ready():
            t.toc()
            logger.info("%.2f%% matrices (%s)" % (100 * (ex_num + 1) / vocabsize, si(ex_num + 1)))
    logger.info("learned %s matrices in %s" % (si(vocabsize), t.toc(hms=True)))
    return lfs
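# Each learned slice of lfs.A is a linear map from a noun vector to a composed
# phrase vector: the transpose is stored, so the matrix multiplies a column
# vector directly. The helper below is a hypothetical composition sketch; its
# name and the example words in the docstring are illustrative, not from this
# module.
def _compose_sketch(lfs, words, modifier, noun):
    """Illustrative only: compose a phrase vector for (modifier, noun), e.g.
    _compose_sketch(lfs, words, "red", "car"), using a matrix learned above."""
    M = lfs.A[lfs.name2index[modifier]]
    return np.dot(M, words[words.name2index[noun]])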
def load_vocab(self, counts_infile):
    with h5py.File(counts_infile, "r") as fin:
        self.index2name = fin["index2name"][:]
        self.index2count = fin["index2count"][:]
    logger.info("loaded a word vocabulary of size %s", si(len(self.index2name)))
    self.name2index = {e: i for i, e in enumerate(self.index2name)}
    self._finalise_vocab()
    # set context vectors to zero
    self.reset_weights()
def train_tuples(self, corpus_infile, counts_infile, epochs=1, report_freq=20):
    if len(self.index2sample) == 0:
        logger.error("attempted to start training but vocabulary has not been loaded")
        raise RuntimeError("You must build/load the vocabulary before training the model")
    epochs = int(epochs) or 1

    # count the number of phrase vectors to be learned
    with h5py.File(counts_infile, "r") as fcount:
        phrase_index2count = fcount["index2count"][:]
        phrase_index2name = fcount["index2name"][:]
        vocabsize = len(phrase_index2count)

    # initialise temporary work memory and phrase vectors
    work = np.zeros(self.dim, dtype=floatX)
    embeddings = np.ascontiguousarray((np.random.rand(vocabsize, self.dim) - 0.5) / self.dim,
                                      dtype=floatX)
    logger.info("initialised a %s x %s phrase embedding matrix", si(vocabsize), si(self.dim))

    with smart_open(corpus_infile, 'r') as fin:
        total_words = 0
        # read the number of sentences in the corpus
        corpus_sentences = int(next(fin).strip())
        total_sentences = epochs * corpus_sentences
        logger.info("loaded corpus with %s examples, training for %d epochs",
                    si(corpus_sentences), epochs)
        t = Timer(interval=report_freq)
        t.tic()
        word_count = 0
        for epoch in range(epochs):
            fin.seek(0)
            next(fin)  # skip first line with number of sentences
            for sentence_num, line in enumerate(fin, start=epoch * corpus_sentences):
                sentence = list(map(int, line.strip().split()))[:self.window + 1]
                if len(sentence) <= 1:
                    continue
                # linearly decay the learning rate over all epochs
                alpha = self.alpha * (1 - sentence_num / total_sentences)
                word_count += len(sentence) - 1
                train_tuple(self, sentence, alpha, embeddings, work)
                if t.ready():
                    t.toc()
                    logger.info("%.2f%% examples @ %s words/s, alpha %.6f"
                                % (100 * sentence_num / total_sentences,
                                   si(word_count / t.interval), alpha))
                    total_words += word_count
                    word_count = 0
        total_words += word_count
    logger.info("trained on %s words (%s examples) in %s @ %s words/s"
                % (si(total_words), si(total_sentences), t.toc(hms=True),
                   si(total_words / t.toc())))
    return Embeddings(embeddings, phrase_index2name, phrase_index2count)
def train_sentences(self, corpus_infile, epochs=1, report_freq=20):
    if len(self.index2sample) == 0:
        logger.error("attempted to start training but vocabulary has not been loaded")
        raise RuntimeError("You must build/load the vocabulary before training the model")
    epochs = int(epochs) or 1

    # initialise temporary work memory and word vectors
    work = np.zeros(self.dim, dtype=floatX)
    embeddings = np.ascontiguousarray((np.random.rand(len(self.index2name), self.dim) - 0.5) / self.dim,
                                      dtype=floatX)
    logger.info("initialised a %s x %s embedding matrix", si(len(self.index2name)), si(self.dim))

    with smart_open(corpus_infile, 'r') as fin:
        total_words = 0
        # read the number of sentences in the corpus
        corpus_sentences = int(next(fin).strip())
        total_sentences = epochs * corpus_sentences
        logger.info("loaded corpus with %s sentences, training for %d epochs",
                    si(corpus_sentences), epochs)
        t = Timer(interval=report_freq)
        t.tic()
        word_count = 0
        for epoch in range(epochs):
            fin.seek(0)
            next(fin)  # skip first line with number of sentences
            for sentence_num, line in enumerate(fin, start=epoch * corpus_sentences):
                # linearly decay the learning rate over all epochs
                alpha = self.alpha * (1 - sentence_num / total_sentences)
                sentence = list(map(int, line.strip().split()))
                word_count += len(sentence)
                train_sentence(self, sentence, alpha, embeddings, work)
                if t.ready():
                    t.toc()
                    if self.dev:
                        cor = self.test_dev(embeddings)
                        logger.info("%.2f%% sentences @ %s words/s, alpha %.6f, corr %.5f (p %.2e)"
                                    % (100 * sentence_num / total_sentences,
                                       si(word_count / t.interval), alpha, cor[0], cor[1]))
                    else:
                        logger.info("%.2f%% sentences @ %s words/s, alpha %.6f"
                                    % (100 * sentence_num / total_sentences,
                                       si(word_count / t.interval), alpha))
                    total_words += word_count
                    word_count = 0
        total_words += word_count
    logger.info("trained on %s sentences (%s words) in %s @ %s words/s"
                % (si(total_sentences), si(total_words), t.toc(hms=True),
                   si(total_words / t.toc())))
    if self.dev:
        cor = self.test_dev(embeddings)
        logger.info("correlation on development set %.5f (p %.2e)" % cor)
    return Embeddings(embeddings, self.index2name, self.index2count)
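# Both training entry points read the same plain-text corpus layout: a first line
# giving the number of sentences, then one line of space-separated integer indices
# per sentence (train_tuples additionally truncates each line to window + 1 ids).
# The writer below is a sketch of producing that layout; the helper name and the
# uncompressed plain-text output are assumptions.
def _write_corpus_sketch(path, sentences):
    """Illustrative only: write the corpus layout read by train_sentences and
    train_tuples (first line = sentence count, then one id line per sentence)."""
    with open(path, "w") as fout:
        fout.write("%d\n" % len(sentences))
        for sent in sentences:
            fout.write(" ".join(map(str, sent)) + "\n")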
def reset_weights(self, what=None):
    vocabsize = len(self.index2name)
    self.contexts = np.zeros((vocabsize, self.dim), dtype=floatX, order='C')
    logger.info('initialised a %s x %s context matrix', si(vocabsize), si(self.dim))