def test_word2vec():
    """word2vec datasource yields one 300-d vector per token and pickles cleanly."""
    wv = vectorizers.word2vec(
        wikitext.revision.datasources.words,
        vectorize_words,
        name='word vectors')

    # Four whitespace-separated tokens -> four 300-dimensional vectors.
    vectors = solve(wv, cache={ro.revision.text: 'a bv c d'})
    assert len(vectors) == 4
    assert len(vectors[0]) == 300

    # Empty text still yields a single 300-dimensional vector.
    vectors = solve(wv, cache={ro.revision.text: ''})
    assert len(vectors) == 1
    assert len(vectors[0]) == 300

    # The datasource must survive a pickle round-trip unchanged.
    assert pickle.loads(pickle.dumps(wv)) == wv
def load_vectorizer(self, enwiki_kvs_path):
    """Build the mean word2vec feature over lower-cased enwiki revision text.

    :param enwiki_kvs_path: path to a gensim KeyedVectors file, opened with
        ``mmap="r"`` (read-only memory map).
    :return: an :func:`aggregators.mean` feature named
        ``"revision.text.en_vectors_mean"``.
    """
    keyed_vectors = vectorizers.word2vec.load_gensim_kv(
        path=enwiki_kvs_path, mmap="r")

    # Bind the loaded vectors into a word -> embedding callable.
    to_vectors = functools.partial(
        vectorizers.word2vec.vectorize_words, keyed_vectors)

    # One embedding per lower-cased word of the revision text.
    text_vectors = vectorizers.word2vec(
        mappers.lower_case(wikitext.revision.datasources.words),
        to_vectors,
        name="revision.text.en_vectors")

    # Average the per-word embeddings into one fixed-length vector.
    return aggregators.mean(
        text_vectors,
        vector=True,
        name="revision.text.en_vectors_mean")
from revscoring.datasources.meta import vectorizers, mappers
from revscoring.features import wikitext
from revscoring.features.meta import aggregators

# Pre-trained Czech Wikipedia word vectors, loaded with a read-only
# memory map (mmap='r').
cswiki_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="cswiki-20191201-learned_vectors.50_cell.100k.kv",
    mmap='r')


def vectorize_words(words):
    """Map a sequence of words to their cswiki embedding vectors."""
    return vectorizers.word2vec.vectorize_words(cswiki_kvs, words)


# One embedding per lower-cased word of the revision text ...
revision_text_vectors = vectorizers.word2vec(
    mappers.lower_case(wikitext.revision.datasources.words),
    vectorize_words,
    name="revision.text.cs_vectors")

# ... averaged into a single fixed-length feature vector.
w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.cs_vectors_mean")

# Both topic models consume the same feature list.
drafttopic = [w2v]
articletopic = drafttopic
# Pre-trained Wikidata entity/property vectors, read-only memory mapped.
wikidata_kvs = vectorizers.word2vec.load_gensim_kv(
    filename="wikidata-20200501-learned_vectors.50_cell.10k.kv",
    mmap="r")


def process_claims_to_words(claims):
    """Flatten (property, value) claim pairs into embedding-lookup tokens.

    Every property ID is kept; a claim value is kept only when it matches
    ``QID_RE`` (i.e. looks like an item QID).
    """
    tokens = []
    for pid, value in claims:
        if QID_RE.match(value) is not None:
            tokens.extend((pid, value))
        else:
            tokens.append(pid)
    return tokens


def vectorize_words(words):
    """Map claim tokens to their wikidata embedding vectors."""
    return vectorizers.word2vec.vectorize_words(wikidata_kvs, words)


# Datasource producing the flattened claim tokens for a revision.
claim_words = Datasource(
    "wikidata.revision.claim_words",
    process_claims_to_words,
    depends_on=[wikibase.revision.datasources.claims])

# One embedding per claim token ...
revision_claim_words_vectors = vectorizers.word2vec(
    claim_words,
    vectorize_words,
    name="revision.text.wikidata_vectors")

# ... averaged into a single fixed-length feature vector.
w2v = aggregators.mean(
    revision_claim_words_vectors,
    vector=True,
    name="revision.text.wikidata_vectors_mean")

articletopic = [w2v]
from revscoring.datasources.meta import vectorizers
from revscoring.features.meta import aggregators
from revscoring.languages import english

# Pre-trained GoogleNews vectors; only the 150k most frequent entries are
# loaded (limit=150000).
google_news_kvs = vectorizers.word2vec.load_kv(
    filename="GoogleNews-vectors-negative300.bin.gz",
    limit=150000)


def vectorize_words(words):
    """Map a sequence of words to their GoogleNews embedding vectors."""
    return vectorizers.word2vec.vectorize_words(google_news_kvs, words)


# One embedding per non-stopword of the revision text ...
revision_text_vectors = vectorizers.word2vec(
    english.stopwords.revision.datasources.non_stopwords,
    vectorize_words,
    name="revision.text.google_news_vectors")

# ... averaged into a single fixed-length feature vector.
# NOTE(review): the name uses singular "vector_mean" while sibling modules
# use "vectors_mean"; left as-is since feature names may be baked into
# serialized models.
w2v = aggregators.mean(
    revision_text_vectors,
    vector=True,
    name="revision.text.google_news_vector_mean")

drafttopic = [w2v]