Example #1
def learn_rc_tensors(words, modifiers, compounds,
                   weight=(lambda c: np.log(c)), which="orc",
                   min_count=2, parameter=80.0, report_freq=20):
    logger.info("regularisation parameter: %s", str(parameter))
    if which == "orc":
        arg1 = 3
        arg2 = 2
        logger.info("learning object relative pronouns")
    else:
        arg1 = 2
        arg2 = 3
        logger.info("learning subject relative pronouns")
    # build LexicalFunctions object
    lfs_counter = Counter((m for _,m,*_ in compounds.index2name))
    lfs_index2name = [m for m,_ in lfs_counter.most_common()]
    vocabsize = len(lfs_index2name)
    dim = words.shape[1]
    arr = np.zeros((vocabsize, dim, dim*dim), dtype=floatX)
    eye = np.eye(dim*dim)
    lfs = LexicalFunctions(arr, lfs_index2name)
    logger.info("initialised a %s x %s x %s tensor embedding tensor",
                si(vocabsize), si(dim), si(dim*dim))

    t = Timer(interval=report_freq)
    # build a list of training examples for each modifier
    t.tic()
    phrase_examples = defaultdict(list)
    for i, z in enumerate(zip(compounds.index2name, compounds.index2count)):
        if z[1] >= min_count and z[0][arg1] in modifiers.name2index:
            phrase_examples[z[0][1]].append((words.name2index[z[0][0]],
                                             modifiers.name2index[z[0][arg1]],
                                             words.name2index[z[0][arg2]],
                                             i, z[1]))
    logger.info("Examples built in "+t.toc(hms=True))

    # solve AX = B for each modifier
    t.tic()
    for ex_num, ex in enumerate(phrase_examples.items()):
        lf_name, examples = ex
        if len(examples) < 1:
            continue
        B = np.zeros((len(examples), dim))
        A = np.zeros((len(examples), dim * dim))
        for i,z in enumerate(examples):
            noun1_index, modifier_index, noun2_index, phrase_index, count = z
            w = weight(count)
            A[i] = w * np.outer(words[noun1_index],
                                np.dot(modifiers[modifier_index], words[noun2_index])).flatten()
            B[i] = w * compounds[phrase_index]
        tmp1 = pinv(np.dot(A.T, A) + parameter*eye)
        tmp2 = np.dot(A.T, B)
        lfs.A[lfs.name2index[lf_name]] = np.dot(tmp1, tmp2).T
        if t.ready():
            t.toc()
            logger.info("%.2f%% matrices (%s, %s)" %
                (100 * (ex_num+1) / vocabsize, si(ex_num+1), t.toc(hms=True)))
    logger.info("learned %s tensors in %s" % (si(vocabsize),
       t.toc(hms=True)))
    return lfs
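Both learn_rc_tensors above and learn_matrices in Example #4 fit each lexical function by regularised least squares: weighted input vectors are stacked into A, weighted phrase vectors into B, and the solution X = pinv(A'A + lambda*I) A'B is stored transposed. A minimal NumPy-only sketch of that step, assuming nothing beyond the arrays themselves (the helper name ridge_solve is illustrative, not part of the repository):

import numpy as np
from numpy.linalg import pinv

def ridge_solve(A, B, lam):
    # closed-form ridge regression: argmin_X ||A X - B||^2 + lam ||X||^2
    d = A.shape[1]
    return pinv(A.T @ A + lam * np.eye(d)) @ (A.T @ B)

# tiny usage example with random data
A = np.random.rand(10, 4)      # 10 weighted input vectors
B = np.random.rand(10, 3)      # 10 weighted target (phrase) vectors
X = ridge_solve(A, B, 80.0)    # same regularisation parameter as the default above
assert X.shape == (4, 3)       # the examples store this transposed: lfs.A[...] = X.T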
Example #2
File: word2vec.py Project: jeanm/nlip
    def load_contexts(self, contexts):
        shape = contexts.A.shape
        if shape[0] != len(contexts.index2name) or shape[1] != self.dim:
            logger.error("vocabulary/contexts shape mismatch")
            raise RuntimeError("vocabulary/contexts shape mismatch")
        self.contexts = contexts.A
        self.index2name = contexts.index2name
        self.index2count = contexts.index2count
        self.name2index = {e:i for i,e in enumerate(self.index2name)}
        logger.info('loaded a %s x %s context matrix', si(len(self.index2name)), si(self.dim))
        self._finalise_vocab()
Example #3
File: word2vec.py Project: jeanm/nlip
    def _downsample_vocab(self):
        retain_total = sum(self.index2count)
        # Precalculate each vocabulary item's threshold for sampling
        if not self.sample:
            # no words downsampled
            threshold_count = retain_total
        else:
            # set parameter as proportion of total
            threshold_count = self.sample * retain_total

        self.index2sample = []
        downsample_total, downsample_unique = 0, 0
        for w in range(len(self.index2name)):
            v = self.index2count[w]
            word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v)
            if word_probability < 1.0:
                downsample_unique += 1
                downsample_total += word_probability * v
            else:
                word_probability = 1.0
                downsample_total += v
            self.index2sample.append(int(round(word_probability * 2**32)))

        logger.info("sample=%g downsampled the %s most common words", self.sample, si(downsample_unique))
        logger.info("downsampling will decrease corpus size by approximately %.1f%%",
                    downsample_total * 100.0 / max(retain_total, 1))
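The keep-probability computed above is the standard word2vec subsampling formula, (sqrt(count/threshold) + 1) * threshold/count, later scaled to a 32-bit integer so it can be compared against a random draw during training. A standalone sketch, with the function name and the numbers chosen purely for illustration:

import numpy as np

def keep_probability(count, total_count, sample=1e-3):
    # word2vec-style subsampling: very frequent words are kept with probability < 1
    threshold_count = sample * total_count
    p = (np.sqrt(count / threshold_count) + 1) * (threshold_count / count)
    return min(p, 1.0)

# a word making up 5% of a 1M-token corpus is kept only about 16% of the time
print(keep_probability(50_000, 1_000_000))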
Example #4
def learn_matrices(words, compounds, weight=(lambda c: np.log(c)),
                   min_count=2, parameter=80, report_freq=20):

    # build LexicalFunctions object
    lfs_counter = Counter((a for a,_ in compounds.index2name))
    lfs_index2name = [a for a,_ in lfs_counter.most_common()]
    vocabsize = len(lfs_index2name)
    dim = words.shape[1]
    arr = np.zeros((vocabsize, dim, dim), dtype=floatX)
    eye = np.eye(dim)
    lfs = LexicalFunctions(arr, lfs_index2name)
    logger.info("initialised a %s x %s x %s matrix embedding tensor",
                si(vocabsize), si(dim), si(dim))

    t = Timer(interval=report_freq)
    # build a list of training examples for each modifier
    t.tic()
    phrase_examples = defaultdict(list)
    for i, z in enumerate(zip(compounds.index2name, compounds.index2count)):
        if z[1] >= min_count:
            phrase_examples[z[0][0]].append((words.name2index[z[0][1]], i, z[1]))
    logger.info("Examples built in "+t.toc(hms=True))

    # solve AX = B for each modifier
    t.tic()
    for ex_num, ex in enumerate(phrase_examples.items()):
        lf_name, examples = ex
        if len(examples) < 1:
            continue
        B = np.zeros((len(examples), dim))
        A = np.zeros((len(examples), dim))
        for i, z in enumerate(examples):
            noun_index, phrase_index, count = z
            w = weight(count)
            A[i] = w * words[noun_index]
            B[i] = w * compounds[phrase_index]
        tmp1 = pinv(np.dot(A.T, A) + parameter * eye)
        tmp2 = np.dot(A.T, B)
        lfs.A[lfs.name2index[lf_name]] = np.dot(tmp1, tmp2).T
        if t.ready():
            t.toc()
            logger.info("%.2f%% matrices (%s)" %
                (100 * ex_num / vocabsize, si(ex_num+1)))
    logger.info("learned %s matrices in %s" % (si(vocabsize),
       t.toc(hms=True)))
    return lfs
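After learn_matrices runs, each functional word (e.g. an adjective) owns a dim x dim matrix in lfs.A, and a phrase vector is obtained by applying that matrix to the argument's word vector. A hedged usage sketch, assuming lfs and words as in the code above and that the example words actually occur in their vocabularies:

# compose "red car" from the learned lexical function for "red"
red_matrix = lfs.A[lfs.name2index["red"]]      # shape (dim, dim)
car_vector = words[words.name2index["car"]]    # shape (dim,)
red_car = red_matrix.dot(car_vector)           # composed phrase vector, shape (dim,)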
Example #5
File: word2vec.py Project: jeanm/nlip
    def load_vocab(self, counts_infile):
        with h5py.File(counts_infile, "r") as fin:
            self.index2name = fin["index2name"][:]
            self.index2count = fin["index2count"][:]
        logger.info("loaded a word vocabulary of size %s", si(len(self.index2name)))
        self.name2index = {e:i for i,e in enumerate(self.index2name)}
        self._finalise_vocab()
        # set context vectors to zero
        self.reset_weights()
Example #6
File: word2vec.py Project: jeanm/nlip
    def train_tuples(self, corpus_infile, counts_infile,
            epochs=1, report_freq=20):
        if len(self.index2sample) == 0:
            logger.error("attempted to start training but vocabulary has not been loaded")
            raise RuntimeError("You must build/load the vocabulary before training the model")
        epochs = int(epochs) or 1
        # count the number of phrase vectors to be learned
        vocabsize = 0
        with h5py.File(counts_infile, "r") as fcount:
            phrase_index2count = fcount["index2count"][:]
            phrase_index2name = fcount["index2name"][:]
            vocabsize = len(phrase_index2count)

        # initialise temporary work memory and phrase vectors
        work = np.zeros(self.dim, dtype=floatX)
        embeddings = np.ascontiguousarray((np.random.rand(vocabsize, self.dim) - 0.5) / self.dim,dtype=floatX)
        logger.info("initialised a %s x %s phrase embedding matrix", si(vocabsize), si(self.dim))

        with smart_open(corpus_infile, 'r') as fin:
            total_words = 0
            # read the number of sentences in the corpus
            corpus_sentences = int(next(fin).strip())
            total_sentences = epochs * corpus_sentences
            logger.info("loaded corpus with %s examples, training for %d epochs", si(corpus_sentences), epochs)

            t = Timer(interval=report_freq)
            t.tic()
            word_count = 0
            for epoch in range(epochs):
                fin.seek(0)
                next(fin) # skip first line with number of sentences
                for sentence_num, line in enumerate(fin,start=epoch*corpus_sentences):
                    sentence = list(map(int,line.strip().split()))[:self.window+1]
                    if len(sentence) <= 1: continue
                    alpha = self.alpha * (1 - sentence_num / total_sentences)
                    word_count += len(sentence)-1
                    train_tuple(self, sentence, alpha, embeddings, work)
                    if t.ready():
                        t.toc()
                        logger.info("%.2f%% examples @ %s words/s, alpha %.6f" %
                            (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha))
                        total_words += word_count
                        word_count = 0
                total_words += word_count
        logger.info("trained on %s words (%s examples) in %s @ %s words/s" %
                (si(total_words), si(total_sentences), t.toc(hms=True),
                    si(total_words / t.toc())))
        return Embeddings(embeddings, phrase_index2name, phrase_index2count)
Example #7
File: word2vec.py Project: jeanm/nlip
    def train_sentences(self, corpus_infile, epochs=1, report_freq=20):
        if len(self.index2sample) == 0:
            logger.error("attempted to start training but vocabulary has not been loaded")
            raise RuntimeError("You must build/load the vocabulary before training the model")
        epochs = int(epochs) or 1
        # initialise temporary work memory and word vectors
        work = np.zeros(self.dim, dtype=floatX)
        embeddings = np.ascontiguousarray((np.random.rand(len(self.index2name), self.dim) - 0.5) / self.dim,dtype=floatX)
        logger.info("initialised a %s x %s embedding matrix", si(len(self.index2name)), si(self.dim))
        with smart_open(corpus_infile, 'r') as fin:
            total_words = 0
            # read the number of sentences in the corpus
            corpus_sentences = int(next(fin).strip())
            total_sentences = epochs * corpus_sentences
            logger.info("loaded corpus with %s sentences, training for %d epochs", si(corpus_sentences), epochs)

            t = Timer(interval=report_freq)
            t.tic()
            word_count = 0
            for epoch in range(epochs):
                fin.seek(0)
                next(fin) # skip first line with number of sentences
                for sentence_num, line in enumerate(fin,start=epoch*corpus_sentences):
                    alpha = self.alpha * (1 - sentence_num / total_sentences)
                    sentence = list(map(int,line.strip().split()))
                    word_count += len(sentence)
                    train_sentence(self, sentence, alpha, embeddings, work)
                    if t.ready():
                        t.toc()
                        if self.dev:
                            cor = self.test_dev(embeddings)
                            logger.info("%.2f%% sentences @ %s words/s, alpha %.6f, corr %.5f (p %.2e)" %
                                (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha, cor[0], cor[1]))
                        else:
                            logger.info("%.2f%% sentences @ %s words/s, alpha %.6f" %
                                (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha))
                        total_words += word_count
                        word_count = 0
                total_words += word_count
        logger.info("trained on %s sentences (%s words) in %s @ %s words/s" %
                (si(total_sentences), si(total_words), t.toc(hms=True),
                    si(total_words / t.toc())))
        if self.dev:
            cor = self.test_dev(embeddings)
            logger.info("correlation on development set %.5f (p %.2e)" % cor)
        return Embeddings(embeddings, self.index2name, self.index2count)
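The training methods above are meant to be chained: load or build the vocabulary first, then call train_sentences (or train_tuples), which returns an Embeddings object. An end-to-end sketch; the class name Word2Vec, the import path, and the constructor arguments are assumptions about nlip's API rather than something shown in these snippets:

from nlip.word2vec import Word2Vec   # hypothetical import path

model = Word2Vec(dim=100, window=5, alpha=0.025, sample=1e-3)   # assumed constructor
model.load_vocab("counts.h5")        # as in Example #5: fills index2name/index2count
emb = model.train_sentences("corpus.txt", epochs=5, report_freq=20)
# corpus.txt: first line holds the sentence count, each further line is
# whitespace-separated word indices; emb wraps the learned matrix and the vocabulary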
Example #8
File: word2vec.py Project: jeanm/nlip
    def reset_weights(self, what=None):
        vocabsize = len(self.index2name)
        self.contexts = np.zeros((vocabsize, self.dim), dtype=floatX, order='C')
        logger.info('initialised a %s x %s context matrix', si(vocabsize), si(self.dim))