Example #1
import pickle

from revscoring.datasources import Datasource
from revscoring.dependencies import solve
from revscoring.features.meta import aggregators


def test_mean_vectors():
    my_list = Datasource("my_list")
    my_mean = aggregators.mean(my_list, vector=True)
    cache = {my_list: [[1, 2, 3], [4, 5, 6]]}
    assert all(a == b
               for a, b in zip(solve(my_mean, cache=cache), [2.5, 3.5, 4.5]))
    cache = {my_list: [[]]}
    assert solve(my_mean, cache=cache) == [0]
    cache = {my_list: [None]}
    assert solve(my_mean, cache=cache) == [0]

    assert pickle.loads(pickle.dumps(my_mean)) == my_mean
Example #2
import functools

from revscoring.datasources.meta import mappers, vectorizers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators


# Excerpted from a class method in the original; self is unused in the body.
def load_vectorizer(self, enwiki_kvs_path):
    # Memory-map the pre-trained English KeyedVectors file.
    enwiki_kvs = vectorizers.word2vec.load_gensim_kv(
        path=enwiki_kvs_path,
        mmap="r"
    )

    vectorize_words = functools.partial(
        vectorizers.word2vec.vectorize_words, enwiki_kvs)

    # Lower-case the revision's words, then map each word to its vector.
    revision_text_vectors = vectorizers.word2vec(
        mappers.lower_case(wikitext.revision.datasources.words),
        vectorize_words,
        name="revision.text.en_vectors")

    # Aggregate the word vectors into a single element-wise mean vector.
    w2v = aggregators.mean(
        revision_text_vectors,
        vector=True,
        name="revision.text.en_vectors_mean"
    )

    return w2v
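
A minimal usage sketch for the function above, assuming revscoring's dependency solver and that the word datasource resolves from revision_oriented.revision.text (the path and text are placeholders; self is unused, so None stands in for it):

from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve

w2v = load_vectorizer(None, "enwiki-learned_vectors.50_cell.100k.kv")  # placeholder path
cache = {revision_oriented.revision.text: "Some example revision text."}
vector = solve(w2v, cache=cache)  # one float per vector cell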
Example #3
from revscoring.datasources.meta import vectorizers, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators

cswiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="cswiki-20191201-learned_vectors.50_cell.100k.kv", mmap='r')


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(cswiki_kvs, words)


revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.cs_vectors")

w2v = aggregators.mean(revision_text_vectors,
                       vector=True,
                       name="revision.text.cs_vectors_mean")

drafttopic = [w2v]
articletopic = drafttopic
Example #4
import re

from revscoring.datasources import Datasource
from revscoring.datasources.meta import vectorizers
from revscoring.features import wikibase
from revscoring.features.meta import aggregators

# QID_RE is not defined in this snippet; an assumed pattern for
# Wikidata item identifiers (e.g. "Q42").
QID_RE = re.compile(r"Q\d+")

wikidata_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="wikidata-20200501-learned_vectors.50_cell.10k.kv", mmap="r")


def process_claims_to_words(claims):
    words = []
    for pid, value in claims:
        words.append(pid)
        if QID_RE.match(value) is not None:
            words.append(value)
    return words


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(wikidata_kvs, words)


claim_words = Datasource("wikidata.revision.claim_words",
                         process_claims_to_words,
                         depends_on=[wikibase.revision.datasources.claims])

revision_claim_words_vectors = vectorizers.word2vec(
    claim_words, vectorize_words, name="revision.text.wikidata_vectors")

w2v = aggregators.mean(revision_claim_words_vectors,
                       vector=True,
                       name="revision.text.wikidata_vectors_mean")

articletopic = [w2v]
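
To make the claim handling above concrete, a small sketch of what process_claims_to_words yields (the claims are made up; only values that look like QIDs are kept alongside their property IDs):

claims = [("P31", "Q5"), ("P569", "+1952-03-11T00:00:00Z")]
process_claims_to_words(claims)
# -> ['P31', 'Q5', 'P569']  (the date value fails QID_RE, so only its PID is kept)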
Example #5
from revscoring.datasources.meta import vectorizers
from revscoring.features.meta import aggregators
from revscoring.languages import english


google_news_kvs = vectorizers.word2vec.load_kv(
    filename="GoogleNews-vectors-negative300.bin.gz", limit=150000)


def vectorize_words(words):
    return vectorizers.word2vec.vectorize_words(google_news_kvs, words)


revision_text_vectors = vectorizers.word2vec(
    english.stopwords.revision.datasources.non_stopwords,
    vectorize_words,
    name="revision.text.google_news_vectors")

w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.google_news_vector_mean"
)

drafttopic = [w2v]
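
A short usage sketch, assuming the non-stopword datasource resolves from revision_oriented.revision.text (the sentence is a placeholder):

from revscoring.datasources import revision_oriented
from revscoring.dependencies import solve

cache = {revision_oriented.revision.text: "The quick brown fox jumps over the lazy dog."}
vector = solve(w2v, cache=cache)  # 300 floats; GoogleNews vectors are 300-dimensional
# Stopwords such as "the" and "over" are filtered out before vectorization.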