Example #1
File: evaluation.py Project: jeanm/nlip
# Inferred imports for this excerpt; the nlip import path is an assumption
import json
import numpy as np
from scipy.spatial.distance import cosine
from scipy.stats import spearmanr
from nlip import smart_open, Embeddings  # assumed location within the project

def similarity(arg1, test_infile):
    with smart_open(test_infile, "r") as f:
        test = json.load(f)
    gold = np.array([float(x[2]) for x in test])
    # we're given a tuple: matrix-vector composition
    if isinstance(arg1, tuple):
        if len(arg1) == 2:
            lf = arg1[0]
            emb = arg1[1]
            # augmented matrices: an extra bias column, so append 1 to each vector
            if lf.A.shape[2] == lf.A.shape[1] + 1:
                ours = np.array([1 - cosine(
                    np.dot(lf.word(x[0][0]), np.hstack((emb.word(x[0][1]), [1]))),
                    np.dot(lf.word(x[1][0]), np.hstack((emb.word(x[1][1]), [1])))) for x in test])
            # standard matrices
            else:
                ours = np.array([1 - cosine(
                    np.dot(lf.word(x[0][0]), emb.word(x[0][1])),
                    np.dot(lf.word(x[1][0]), emb.word(x[1][1]))) for x in test])
            return spearmanr(gold, ours)
        raise TypeError("Invalid input format")
    # we're only given embeddings: do cosine similarity of vectors
    elif isinstance(arg1, Embeddings):
        ours = np.array([1 - cosine(arg1.word(x[0]), arg1.word(x[1])) for x in test])
        return spearmanr(gold, ours)
    raise TypeError("Invalid input format")
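A minimal usage sketch follows. The Embeddings.load() call and both file names are hypothetical, made up for illustration; an actual Embeddings instance would come from the training methods below or however the project loads them.

# Hypothetical usage: Embeddings.load() and the file names are assumptions
emb = Embeddings.load("vectors.h5")          # pretrained word embeddings
rho, pval = similarity(emb, "wordsim.json")  # JSON list of [word1, word2, score] items
print("Spearman rho %.3f (p %.2e)" % (rho, pval))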
Example #2
File: word2vec.py Project: jeanm/nlip
    def train_tuples(self, corpus_infile, counts_infile,
            epochs=1, report_freq=20):
        if len(self.index2sample) == 0:
            logger.error("attempted to start training but vocabulary has not been loaded")
            raise RuntimeError("You must build/load the vocabulary before training the model")
        epochs = int(epochs) or 1
        # count the number of phrase vectors to be learned
        vocabsize = 0
        with h5py.File(counts_infile, "r") as fcount:
            phrase_index2count = fcount["index2count"][:]
            phrase_index2name = fcount["index2name"][:]
            vocabsize = len(phrase_index2count)

        # initialise temporary work memory and phrase vectors
        work = np.zeros(self.dim, dtype=floatX)
        embeddings = np.ascontiguousarray(
            (np.random.rand(vocabsize, self.dim) - 0.5) / self.dim, dtype=floatX)
        logger.info("initialised a %s x %s phrase embedding matrix", si(vocabsize), si(self.dim))

        with smart_open(corpus_infile, 'r') as fin:
            total_words = 0
            # read the number of sentences in the corpus
            corpus_sentences = int(next(fin).strip())
            total_sentences = epochs * corpus_sentences
            logger.info("loaded corpus with %s examples, training for %d epochs", si(corpus_sentences), epochs)

            t = Timer(interval=report_freq)
            t.tic()
            word_count = 0
            for epoch in range(epochs):
                fin.seek(0)
                next(fin) # skip first line with number of sentences
                for sentence_num, line in enumerate(fin, start=epoch*corpus_sentences):
                    sentence = list(map(int, line.strip().split()))[:self.window+1]
                    if len(sentence) <= 1:
                        continue
                    alpha = self.alpha * (1 - sentence_num / total_sentences)
                    word_count += len(sentence)-1
                    train_tuple(self, sentence, alpha, embeddings, work)
                    if t.ready():
                        t.toc()
                        logger.info("%.2f%% examples @ %s words/s, alpha %.6f" %
                            (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha))
                        total_words += word_count
                        word_count = 0
                total_words += word_count
        logger.info("trained on %s words (%s examples) in %s @ %s words/s" %
                (si(total_words), si(total_sentences), t.toc(hms=True),
                    si(total_words / t.toc())))
        return Embeddings(embeddings, phrase_index2name, phrase_index2count)
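The reading loop implies the corpus layout: the first line holds the number of examples, and every following line is a space-separated list of integer ids, of which the reader keeps at most self.window+1. A minimal sketch of producing such a file (ids and file name are made up):

# Sketch of the corpus format train_tuples() reads: a count line, then one
# line of space-separated integer ids per example (values are illustrative)
examples = [[0, 3, 7], [2, 5], [1, 4, 6, 9]]
with open("corpus.txt", "w") as fout:
    fout.write("%d\n" % len(examples))
    for ids in examples:
        fout.write(" ".join(map(str, ids)) + "\n")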
Example #3
File: word2vec.py Project: jeanm/nlip
    def train_sentences(self, corpus_infile, epochs=1, report_freq=20):
        if len(self.index2sample) == 0:
            logger.error("attempted to start training but vocabulary has not been loaded")
            raise RuntimeError("You must build/load the vocabulary before training the model")
        epochs = int(epochs) or 1
        # initialise temporary work memory and word vectors
        work = np.zeros(self.dim, dtype=floatX)
        embeddings = np.ascontiguousarray(
            (np.random.rand(len(self.index2name), self.dim) - 0.5) / self.dim, dtype=floatX)
        logger.info("initialised a %s x %s embedding matrix", si(len(self.index2name)), si(self.dim))
        with smart_open(corpus_infile, 'r') as fin:
            total_words = 0
            # read the number of sentences in the corpus
            corpus_sentences = int(next(fin).strip())
            total_sentences = epochs * corpus_sentences
            logger.info("loaded corpus with %s sentences, training for %d epochs", si(corpus_sentences), epochs)

            t = Timer(interval=report_freq)
            t.tic()
            word_count = 0
            for epoch in range(epochs):
                fin.seek(0)
                next(fin) # skip first line with number of sentences
                for sentence_num, line in enumerate(fin, start=epoch*corpus_sentences):
                    alpha = self.alpha * (1 - sentence_num / total_sentences)
                    sentence = list(map(int, line.strip().split()))
                    word_count += len(sentence)
                    train_sentence(self, sentence, alpha, embeddings, work)
                    if t.ready():
                        t.toc()
                        if self.dev:
                            cor = self.test_dev(embeddings)
                            logger.info("%.2f%% sentences @ %s words/s, alpha %.6f, corr %.5f (p %.2e)" %
                                (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha, cor[0], cor[1]))
                        else:
                            logger.info("%.2f%% sentences @ %s words/s, alpha %.6f" %
                                (100 * sentence_num / total_sentences, si(word_count / t.interval), alpha))
                        total_words += word_count
                        word_count = 0
                total_words += word_count
        logger.info("trained on %s sentences (%s words) in %s @ %s words/s" %
                (si(total_sentences), si(total_words), t.toc(hms=True),
                    si(total_words / t.toc())))
        # only evaluate when a development set was provided, as in the loop above
        if self.dev:
            cor = self.test_dev(embeddings)
            logger.info("correlation on development set %.5f (p %.2e)" % cor)
        return Embeddings(embeddings, self.index2name, self.index2count)
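Both training methods anneal the learning rate linearly from self.alpha towards zero over the whole run, via alpha = self.alpha * (1 - sentence_num / total_sentences). A standalone sketch of that schedule with illustrative numbers:

# The linear learning-rate decay used by train_tuples() and train_sentences()
def lr_schedule(alpha0, processed, total):
    return alpha0 * (1 - processed / total)

# alpha0 = 0.025 decays to 0.0 as processed approaches total
print([round(lr_schedule(0.025, i, 10), 4) for i in range(11)])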
Example #4
File: evaluation.py Project: jeanm/nlip
# Inferred imports for this excerpt; smart_open's import path is an assumption,
# and _ap (average precision) is assumed to be defined elsewhere in evaluation.py
import json
from collections import Counter
import numpy as np
from scipy.spatial.distance import cosine
from nlip import smart_open  # assumed location within the project

def meanap(relpr_s, relpr_o, verb_s, verb_o, noun, test_infile):
    with smart_open(test_infile, "r") as f:
        test = json.load(f)
    # extract target nouns
    target_nouns = set(t for _, t, *_ in test)
    counts = Counter(t for _, t, *_ in test)
    # compose relative clauses; the commented-out lines below are alternative
    # compositions left over from development
    relative_clauses = []
    for which, t, n1, relpr, v, n2 in test:
        #relative_clauses.append((t, np.dot(verb_o.word(v), noun.word(n2))))
        if which == "SBJ":
            #relative_clauses.append((t, noun.word(n1)+noun.word(v)+noun.word(n2)))
            #relative_clauses.append((t, noun.word(v)+noun.word(n2)))
            #relative_clauses.append((t, noun.word(n2)))
            #relative_clauses.append((t, noun.word(n1)+np.dot(verb_o.word(v),noun.word(n2))))
            #relative_clauses.append((t, np.dot(relpr_s.word(relpr),
            #    np.outer(noun.word(n1),
            #             noun.word(v)+noun.word(n2)).flatten())))
            relative_clauses.append((t, np.dot(relpr_s.word(relpr),
                np.outer(noun.word(n1),
                         np.dot(verb_o.word(v), noun.word(n2))).flatten())))
        else:
            #relative_clauses.append((t, noun.word(n1)+noun.word(v)+noun.word(n2)))
            #relative_clauses.append((t, noun.word(v)+noun.word(n2)))
            #relative_clauses.append((t, noun.word(n2)))
            #relative_clauses.append((t, noun.word(n1)+np.dot(verb_s.word(v),noun.word(n2))))
            #relative_clauses.append((t, np.dot(relpr_o.word(relpr),
            #    np.outer(noun.word(n1),
            #             noun.word(v)+noun.word(n2)).flatten())))
            relative_clauses.append((t, np.dot(relpr_o.word(relpr),
                np.outer(noun.word(n1),
                         np.dot(verb_s.word(v), noun.word(n2))).flatten())))
    scores = []
    for target in target_nouns:
        #print(target)
        predicted = [(t, 1 - cosine(noun.word(target), v)) for t, v in relative_clauses]
        predicted.sort(key=lambda x: x[1], reverse=True)
        ap = _ap(target, [t for t, *_ in predicted], counts[target])
        #print((target, [(t.upper(),r) if t == target else (t,r) for t,_,r in predicted]), ap)
        scores.append(ap)
    return np.mean(scores)
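The active composition above works as follows: apply the verb matrix to the noun embedded in the clause, take the outer product with the head noun vector, flatten it, and map the result through the relative-pronoun matrix. A self-contained numpy sketch with toy dimensions (all values random, purely to show the shapes; variable names mirror the original call):

import numpy as np

d = 4                                 # toy embedding dimensionality
head = np.random.rand(d)              # head noun vector, i.e. noun.word(n1)
verb_mat = np.random.rand(d, d)       # verb matrix, i.e. verb_o.word(v)
obj = np.random.rand(d)               # embedded noun vector, i.e. noun.word(n2)
relpr_mat = np.random.rand(d, d * d)  # relative pronoun: maps a flattened d x d outer product to d

vp = np.dot(verb_mat, obj)                                # verb applied to its argument
clause = np.dot(relpr_mat, np.outer(head, vp).flatten())  # relative-clause vector
print(clause.shape)                                       # (4,)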