Example #1
def load_word2vec(
        # model_filename=word2vec_model_dir + "GoogleNews-vectors-negative300.bin.gz",
        # model_type="c_word2vec",
        # compact_name="google300",
        model_filename=word2vec_model_dir + 'en_1000_no_stem/en.model',
        model_type="gensim",
        compact_name="wiki1000"
):
    """
    :param model_filename: path to the saved word2vec model file
    :param model_type: can be "c_word2vec" (binary C word2vec format) or "gensim"
    :param compact_name: short identifier attached to the returned model
    :return: the loaded Word2Vec model
    """
    print >> sys.stderr, "loading word2vec model ", model_filename, \
        "(may take a few minutes) ..."
    start_time = time.time()
    if "c_word2vec" == model_type:
        model = Word2Vec.load_word2vec_format(model_filename, binary=True)
    elif "gensim" == model_type:
        model = Word2Vec.load(model_filename)
    else:
        raise ValueError("The specified model_type '"
                         + str(model_type) + "' is not supported!")
    model.compact_name = compact_name
    elapsed = time.time() - start_time
    print >> sys.stderr, "word2vec model loading finished.", elapsed, "s"
    return model
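
A minimal usage sketch (not part of the original snippet); it uses the commented-out Google News defaults shown above and assumes word2vec_model_dir is defined:

# Hypothetical call with the alternative model referenced in the comments above.
model = load_word2vec(
    model_filename=word2vec_model_dir + "GoogleNews-vectors-negative300.bin.gz",
    model_type="c_word2vec",
    compact_name="google300")
print >> sys.stderr, "loaded model:", model.compact_name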
Example #2
    def getEmbeddingsByTerm(self,
                            dim=50,
                            win=1,
                            pretrained_file=None,
                            binary=False,
                            word_cleaning=False,
                            op='doc'):  #op={doc, avg, sum}
        d2v = []
        if op == 'doc':  # doc2vec
            d2v_model = TextCorpus._getDoc2Vec(self._word_count.toarray(), dim,
                                               win)
            # w2v = []
            # for i in xrange(len(d2v_model.vocab)):
            #     w2v.append(d2v_model[str(1)])
            for i in xrange(len(d2v_model.docvecs.doctags)):
                d2v.append(d2v_model.docvecs[str(i)])
        else:  #word2vec and then apply op on words of each doc
            w2v = None
            if pretrained_file is not None:
                w2v = Word2Vec.load_word2vec_format(pretrained_file,
                                                    binary=binary)
            else:
                w2v = Word2Vec(TextCorpus._getDocsByBagOfTokenIds(
                    self._word_count.toarray()),
                               size=dim,
                               window=win,
                               min_count=0,
                               workers=multiprocessing.cpu_count())

            if op == 'sum':
                func = np.sum
            elif op == 'avg':
                func = np.average
            # word_cleaning shouldn't be used when there is no pretrained model!
            for x in self._word_count.toarray():
                nz = x.nonzero()[0]
                if word_cleaning:
                    tokens = [re.sub("[^a-zA-Z]", " ", self.inv_words[i]).strip()
                              for i in nz]
                else:
                    tokens = [str(i) for i in nz]
                vecs = [w2v[t] for t in tokens if t in w2v]
                d2v.append(func(vecs, axis=0) if len(nz) > 0 else np.zeros(dim))
        return np.array(d2v)
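
For reference, the op='avg' branch above amounts to mean-pooling word vectors per document; a standalone sketch with a toy corpus (hypothetical data, older gensim size= keyword as used in this snippet):

import numpy as np
from gensim.models import Word2Vec

docs = [["cat", "sat", "mat"], ["dog", "barked"]]
toy_w2v = Word2Vec(docs, size=50, window=1, min_count=0, workers=1)
doc_vecs = np.array([
    np.average([toy_w2v[w] for w in doc if w in toy_w2v], axis=0)
    if len(doc) > 0 else np.zeros(50)
    for doc in docs
])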
Example #3
def trainWord2Vec(doc_list=None,
                  buildvoc=1,
                  passes=20,
                  sg=1,
                  size=100,
                  dm_mean=0,
                  window=5,
                  hs=1,
                  negative=5,
                  min_count=1,
                  workers=4):
    model = Word2Vec(size=size,
                     sg=sg,
                     window=window,
                     hs=hs,
                     negative=negative,
                     min_count=min_count,
                     workers=workers)

    if buildvoc == 1:
        print('Building Vocabulary')
        model.build_vocab(doc_list)  # build vocabulary with words + node IDs

    for epoch in range(passes):
        print('Iteration %d ....' % epoch)
        shuffle(doc_list)  # shuffling gets best results

        model.train(doc_list)

    return model
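
A usage sketch with toy tokenized sentences (illustrative only; assumes the older gensim train() API this snippet targets, and that shuffle comes from the random module):

from random import shuffle  # trainWord2Vec above relies on shuffle

sentences = [["node_1", "likes", "node_2"], ["node_2", "follows", "node_3"]]
w2v_model = trainWord2Vec(doc_list=sentences, passes=5, size=50, workers=2)
print(w2v_model.most_similar("node_1", topn=2))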
Example #4
    def getEmbeddingsByChar(self, dim=50, win=1, pre=None, op='doc'):  # op={doc, avg, sum}
        d2v = []
        if op == 'doc':  # doc2vec
            d2v_model = TextCorpus._getDoc2Vec(self._char_count.toarray(), dim,
                                               win)
            for i in xrange(len(d2v_model.docvecs.doctags)):
                d2v.append(d2v_model.docvecs[str(i)])
            return np.array(d2v)  #, np.array(w2v)
        else:  # word2vec over character tokens, then apply op per doc
            w2v = Word2Vec(TextCorpus._getDocsByBagOfTokenIds(
                self._char_count.toarray()),
                           size=dim,
                           window=win,
                           min_count=0,
                           workers=multiprocessing.cpu_count())
            if op == 'sum':
                func = np.sum
            elif op == 'avg':
                func = np.average
            for x in self._char_count.toarray():
                nz = x.nonzero()[0]
                vecs = [w2v[str(i)] for i in nz if str(i) in w2v]
                d2v.append(func(vecs, axis=0) if len(nz) > 0 else np.zeros(dim))
        return np.array(d2v)
Example #5
def train_model(doc_path, output_path, dim=100):
    """
    Training a model.
    Reading the file, building a vocabulary, training, and saving the model

    Args:
        doc_path - str: path to a doc file
        output_path - str: path where the trained model is saved
        dim - int: dimensionality of the word vectors (default 100)
    """
    print "Reading a file ..."
    sentences = read_file(doc_path)
    print "Training a model ..."
    model = Word2Vec(sentences, min_count=0, size=dim, window=10)
    print "Saving the moles ..."
    model.save(output_path)
    print "Done."
def eval_model():
    w2v = Word2Vec.load_word2vec_format(args.save_path, binary=False)
    word2id = dict([(w, i) for i, w in enumerate(w2v.index2word)])
    analogy_questions = read_analogies(args.eval_data, word2id)
    correct = 0
    total = len(analogy_questions)
    for question in analogy_questions:
        a, b, c, d = question  # E.g. [Athens, Greece, Baghdad, Iraq]
        analogies = w2v.most_similar(positive=[b, c], negative=[a], topn=4)
        for analogy in analogies:
            word, _ = analogy
            if d == word:
                # Predicted Correctly!
                correct += 1
                break
    print('Eval %4d/%d accuracy = %4.1f%%' % (correct, total, correct * 100.0 / total))
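
read_analogies, args.save_path and args.eval_data are defined elsewhere; a hypothetical reader consistent with how eval_model uses its output (4-tuples of in-vocabulary words from a questions-words style file, e.g. "Athens Greece Baghdad Iraq") might look like this:

def read_analogies(path, word2id):
    # Hypothetical helper: keep only analogy lines whose four words are all
    # in the vocabulary; ": ..." section header lines are skipped.
    questions = []
    with open(path) as fh:
        for line in fh:
            if line.startswith(':'):
                continue
            words = line.lower().split()
            if len(words) == 4 and all(w in word2id for w in words):
                questions.append(words)
    return questions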
Example #7
    def train(self, size=500, min_count=3, iter=4, window=6, workers=3, **kwargs):
        """Train an embedding model, build a lookup table and model metadata. After training, they will be saved to S3.

        Args:
            kwargs: any additional arguments that gensim.models.doc2vec.Doc2Vec accepts.
        """
        job_postings_generator = job_postings_chain(self.s3_conn, self.quarters, self.jp_s3_path, source=self.source)

        if self.model_type == 'word2vec':
            if not self._model:
                model = Word2Vec(size=size, min_count=min_count, iter=iter, window=window, workers=workers, **kwargs)
            else:
                logging.info("Model existed")
                model = self._model
                self.update = True

            batch_iter = 1
            batch_gen = batches_generator(Word2VecGensimCorpusCreator(job_postings_generator), self.batch_size)
            for batch in batch_gen:
                batch = Reiterable(batch)
                logging.info("Training batch #{} ".format(batch_iter))
                if not self.update:
                    model.build_vocab(batch, update=False)
                    self.update = True
                else:
                    model.build_vocab(batch, update=True)

                model.train(batch, total_examples=model.corpus_count, epochs=model.iter)
                self.vocab_size_cumu.append(len(model.wv.vocab))
                batch_iter += 1
                logging.info('\n')

        elif self.model_type == 'doc2vec':
            model = Doc2Vec(size=size, min_count=min_count, iter=iter, window=window, workers=workers, **kwargs)
            corpus_gen = Doc2VecGensimCorpusCreator(job_postings_generator)
            reiter_corpus_gen = Reiterable(corpus_gen)
            model.build_vocab(reiter_corpus_gen)
            model.train(reiter_corpus_gen, total_examples=model.corpus_count, epochs=model.iter)
            self._lookup = corpus_gen.lookup

        self._model = model
        self._upload()
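
The batch loop above relies on gensim's incremental vocabulary updates (build_vocab with update=True after the first batch); a minimal standalone sketch of that pattern with hypothetical toy batches, using the same gensim 2.x/3.x keywords as this snippet:

from gensim.models import Word2Vec

batches = [[["skill", "python"], ["skill", "sql"]],
           [["skill", "java"], ["skill", "spark"]]]
model = Word2Vec(size=100, min_count=1, iter=4, window=6, workers=3)
for n, batch in enumerate(batches):
    model.build_vocab(batch, update=(n > 0))  # first batch builds the vocab, later ones extend it
    model.train(batch, total_examples=model.corpus_count, epochs=model.iter)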
Example #8
f.write("\n")


# In[17]:

vectors = cbow.get_weights()[0]

### Write the word feature vectors learned during training
for word, i in tokenizer.word_index.items():
    f.write(word)
    f.write(" ")
    f.write(" ".join(map(str, list(vectors[i,:]))))
    f.write("\n")
f.close()
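
The file written above follows the plain-text word2vec format that load_word2vec_format reads: a header line with the vocabulary size and vector dimensionality, then one word and its space-separated vector per line. A hypothetical excerpt (illustrative numbers only):

# 3 4
# alice 0.12 -0.03 0.54 0.20
# rabbit 0.08 0.41 -0.17 0.33
# queen -0.22 0.10 0.05 0.47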


# In[18]:

w2v = Word2Vec.load_word2vec_format('./vectors.txt', binary=False)


# In[19]:

w2v.most_similar(positive=['alice'])


# In[ ]:



Example #9
# In[16]:

### Create the output file for writing
f = open("vectors.txt", "w")

### Write the vocabulary size and the dimensionality of the feature vectors
f.write(" ".join([str(V - 1), str(dim)]))
f.write("\n")

# In[17]:

vectors = cbow.get_weights()[0]

### Write the word feature vectors learned during training
for word, i in tokenizer.word_index.items():
    f.write(word)
    f.write(" ")
    f.write(" ".join(map(str, list(vectors[i, :]))))
    f.write("\n")
f.close()

# In[18]:

w2v = Word2Vec.load_word2vec_format('./vectors.txt', binary=False)

# In[19]:

w2v.most_similar(positive=['alice'])

# In[ ]:
Example #10
revvocab = [i + 4 for i, x in enumerate(vocab)]
# print vocab

train_datax = [i for i, x in enumerate(vocab[:top_words])]
train_datay = [langdict[i] for i in vocab[:top_words]]
test_datax = [i for i, x in enumerate(vocab[:500])]
test_datay = [langdict[i] for i in vocab[:500]]

# vocab = train_data
totaldata = []
for line in data:
    x = line.split('\t')
    if (len(x) == 3):
        y = char_ngram_generator(x[0])
        totaldata.append(y)
w2vmodel = Word2Vec(totaldata, min_count=1)
vectdict = {}
for i in totaldata:
    newlist = [j for j in w2vmodel[i[0]]]
    for j in i[1:]:
        for k in range(len(w2vmodel[j])):
            newlist[k] += w2vmodel[j][k]
    vectdict[i[-1]] = newlist
train_datax = [vectdict[x] for i, x in enumerate(vocab[:top_words])]
train_datay = [langdict[i] for i in vocab[:top_words]]
test_datax = [vectdict[x] for i, x in enumerate(vocab[:500])]
test_datay = [langdict[i] for i in vocab[:500]]

print train_datax[0]
# create the model
model = Sequential()
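
char_ngram_generator is not shown in this snippet; judging from how its output is consumed (a token list fed to Word2Vec whose last element is later used as the dictionary key), one plausible, purely hypothetical reconstruction is:

def char_ngram_generator(word, n=3):
    # Hypothetical reconstruction: character n-grams of the padded word,
    # followed by the word itself so it ends up as the last token.
    padded = '<' + word + '>'
    ngrams = [padded[i:i + n] for i in range(len(padded) - n + 1)]
    return ngrams + [word]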
Example #11
def compute_distance_features():

    # Load data
    train = pd.read_csv("./data/train.csv",
                        names=[
                            'row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text',
                            'text_b_text', 'have_same_meaning'
                        ],
                        index_col=0)
    test = pd.read_csv("./data/test.csv",
                       names=[
                           'row_ID', 'text_a_ID', 'text_b_ID', 'text_a_text',
                           'text_b_text', 'have_same_meaning'
                       ],
                       index_col=0)
    en_stop = set(stopwords.words('english'))
    glove_file = "./data/glove.840B.300d.w2vformat.txt"

    def clean(q):
        # Adapted from https://github.com/aerdem4/kaggle-quora-dup
        q = str(q).lower()
        q = q.replace(",000,000", "m").replace(",000", "k").replace(
            "′", "'").replace("’", "'").replace("won't", "will not").replace(
                "cannot", "can not").replace("can't", "can not").replace(
                    "n't", " not").replace("what's", "what is").replace(
                        "it's", "it is").replace("'ve", " have").replace(
                            "i'm", "i am").replace("'re", " are").replace(
                                "he's",
                                "he is").replace("she's", "she is").replace(
                                    "'s",
                                    " own").replace("%", " percent ").replace(
                                        "₹", " rupee ").replace(
                                            "$", " dollar ").replace(
                                                "€", " euro ").replace(
                                                    "'ll", " will")
        q = re.sub(r"([0-9]+)000000", r"\1m", q)
        q = re.sub(r"([0-9]+)000", r"\1k", q)
        return q

    # Start computation
    all_questions = pd.concat([
        train["text_a_text"], train["text_b_text"], test["text_a_text"],
        test["text_b_text"]
    ])
    question_counts = all_questions.value_counts()

    questions = [clean(q) for q in all_questions]
    questions_token = [[w for w in q.split(' ') if w not in en_stop]
                       for q in questions]

    print("Fit TFIDF Model...")
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    tfidf_vectorizer.fit(all_questions)

    print("Load Glove Model...")
    glove_model = KeyedVectors.load_word2vec_format(glove_file)

    print("Fit Word2Vec Model...")
    word2Vec = Word2Vec(size=100, window=5, min_count=2, sg=1, workers=10)
    word2Vec.build_vocab(questions_token)  # prepare the model vocabulary
    word2Vec.train(sentences=questions_token,
                   total_examples=len(questions_token),
                   epochs=word2Vec.iter)

    print("Fit LSI Model...")
    dictionary = corpora.Dictionary(questions_token)
    corpus = [dictionary.doc2bow(text) for text in questions_token]
    lsi = LsiModel(corpus, num_topics=200, id2word=dictionary)

    print("Fit doc2vec Model...")
    q2idx_dict = {tuple(q): idx for idx, q in enumerate(questions_token)}

    d2v_training_data = []
    for idx, doc in enumerate(questions_token):
        d2v_training_data.append(TaggedDocument(doc, [idx]))

    d2v_dm = Doc2Vec(d2v_training_data,
                     size=100,
                     window=4,
                     min_count=3,
                     workers=16,
                     iter=5)
    d2v_dm.delete_temporary_training_data(keep_doctags_vectors=True,
                                          keep_inference=True)

    d2v_bow = Doc2Vec(d2v_training_data,
                      size=100,
                      window=4,
                      min_count=3,
                      dm=0,
                      workers=16,
                      iter=5)
    d2v_bow.delete_temporary_training_data(keep_doctags_vectors=True,
                                           keep_inference=True)

    def preprocess(df):
        df_features = pd.DataFrame(index=df.index)
        df_intermediate = pd.DataFrame(index=df.index)

        print("--> Compute tokens...")
        df_intermediate["clean_a"] = df.text_a_text.apply(lambda x: clean(x))
        df_intermediate["clean_b"] = df.text_b_text.apply(lambda x: clean(x))

        df_intermediate["words_a"] = df_intermediate.apply(
            lambda row: row.clean_a.split(" "), axis=1)
        df_intermediate["words_b"] = df_intermediate.apply(
            lambda row: row.clean_b.split(" "), axis=1)

        df_intermediate["words_clean_a"] = df_intermediate.apply(
            lambda row: [w for w in row.words_a if w not in en_stop], axis=1)
        df_intermediate["words_clean_b"] = df_intermediate.apply(
            lambda row: [w for w in row.words_b if w not in en_stop], axis=1)

        df_intermediate["stop_a"] = df_intermediate.apply(
            lambda row: [w for w in row.words_a if w in en_stop], axis=1)
        df_intermediate["stop_b"] = df_intermediate.apply(
            lambda row: [w for w in row.words_b if w in en_stop], axis=1)

        print("--> Compute tfidf distance...")
        tfidf_a = tfidf_vectorizer.transform(df_intermediate["clean_a"])
        tfidf_b = tfidf_vectorizer.transform(df_intermediate["clean_b"])

        df_features["tfidf_dist_cosine"] = paired_cosine_distances(
            tfidf_a, tfidf_b)
        df_features["tfidf_dist_euclidean"] = paired_euclidean_distances(
            tfidf_a, tfidf_b)

        print("--> Compute glove distance...")
        glove_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: np.array(
                [glove_model.wv[w] for w in q if w in glove_model.wv]))
        glove_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: np.array(
                [glove_model.wv[w] for w in q if w in glove_model.wv]))

        empty_a = glove_emb_a.apply(lambda x: len(x) == 0)
        empty_b = glove_emb_b.apply(lambda x: len(x) == 0)
        glove_emb_a[empty_a] = glove_emb_a[empty_a].apply(lambda y: np.zeros((1, 300)))
        glove_emb_b[empty_b] = glove_emb_b[empty_b].apply(lambda y: np.zeros((1, 300)))
        glove_emb_a = glove_emb_a.apply(lambda x: np.mean(x, axis=0))
        glove_emb_b = glove_emb_b.apply(lambda x: np.mean(x, axis=0))
        glove_emb_a = np.vstack(glove_emb_a.values)
        glove_emb_b = np.vstack(glove_emb_b.values)

        df_features["glove_dist_cosine"] = paired_cosine_distances(
            glove_emb_a, glove_emb_b)
        df_features["glove_dist_euclidean"] = paired_euclidean_distances(
            glove_emb_a, glove_emb_b)
        df_features["glove_word_mover_dist"] = df_intermediate.apply(
            lambda row: glove_model.wv.wmdistance(row["words_clean_a"], row[
                "words_clean_b"]),
            axis=1)

        print("--> Compute lsi distance...")
        lsi_emb_a = df_intermediate["words_clean_a"].apply(
            lambda x: np.array(lsi[dictionary.doc2bow(x)]))
        lsi_emb_b = df_intermediate["words_clean_b"].apply(
            lambda x: np.array(lsi[dictionary.doc2bow(x)]))

        bad_a = lsi_emb_a.apply(lambda x: len(x) == 0 or x.shape[0] != 200)
        bad_b = lsi_emb_b.apply(lambda x: len(x) == 0 or x.shape[0] != 200)
        lsi_emb_a[bad_a] = lsi_emb_a[bad_a].apply(lambda x: np.zeros((200, 2)))
        lsi_emb_b[bad_b] = lsi_emb_b[bad_b].apply(lambda x: np.zeros((200, 2)))

        # Derive question representations from single lsi vectors
        lsi_emb_a = lsi_emb_a.apply(lambda x: np.mean(x, axis=0))
        lsi_emb_b = lsi_emb_b.apply(lambda x: np.mean(x, axis=0))
        lsi_emb_a = np.vstack(lsi_emb_a.values)
        lsi_emb_b = np.vstack(lsi_emb_b.values)

        df_features["lsi_dist_cosine"] = paired_cosine_distances(
            lsi_emb_a, lsi_emb_b)
        df_features["lsi_dist_euclidean"] = paired_euclidean_distances(
            lsi_emb_a, lsi_emb_b)

        print("--> Compute word2vec distance...")
        word2Vec_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: np.array([word2Vec.wv[w] for w in q
                                if w in word2Vec.wv]))
        word2Vec_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: np.array([word2Vec.wv[w] for w in q
                                if w in word2Vec.wv]))

        empty_w2v_a = word2Vec_emb_a.apply(lambda x: len(x) == 0)
        empty_w2v_b = word2Vec_emb_b.apply(lambda x: len(x) == 0)
        word2Vec_emb_a[empty_w2v_a] = word2Vec_emb_a[empty_w2v_a].apply(
            lambda y: np.zeros((1, 100)))
        word2Vec_emb_b[empty_w2v_b] = word2Vec_emb_b[empty_w2v_b].apply(
            lambda y: np.zeros((1, 100)))

        word2Vec_emb_a = word2Vec_emb_a.apply(lambda x: np.mean(x, axis=0))
        word2Vec_emb_b = word2Vec_emb_b.apply(lambda x: np.mean(x, axis=0))
        word2Vec_emb_a = np.vstack(word2Vec_emb_a.values)
        word2Vec_emb_b = np.vstack(word2Vec_emb_b.values)

        df_features["w2v_dist_cosine"] = paired_cosine_distances(
            word2Vec_emb_a, word2Vec_emb_b)
        df_features["w2v_dist_euclidean"] = paired_euclidean_distances(
            word2Vec_emb_a, word2Vec_emb_b)
        df_features["word2vec_word_mover_dist"] = df_intermediate.apply(
            lambda row: word2Vec.wv.wmdistance(row["words_clean_a"], row[
                "words_clean_b"]),
            axis=1)

        print("--> Compute doc2vec distance...")
        doc_vec_dm_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: d2v_dm.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_dm_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: d2v_dm.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_bow_emb_a = df_intermediate["words_clean_a"].apply(
            lambda q: d2v_bow.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_bow_emb_b = df_intermediate["words_clean_b"].apply(
            lambda q: d2v_bow.docvecs[q2idx_dict[tuple(q)]])
        doc_vec_dm_emb_a = np.vstack(doc_vec_dm_emb_a.values)
        doc_vec_dm_emb_b = np.vstack(doc_vec_dm_emb_b.values)
        doc_vec_bow_emb_a = np.vstack(doc_vec_bow_emb_a.values)
        doc_vec_bow_emb_b = np.vstack(doc_vec_bow_emb_b.values)

        df_features["dm_dist_cosine"] = paired_cosine_distances(
            doc_vec_dm_emb_a, doc_vec_dm_emb_b)
        df_features["dm_dist_euclidean"] = paired_euclidean_distances(
            doc_vec_dm_emb_a, doc_vec_dm_emb_b)
        df_features["dm_word_mover_dist"] = df_intermediate.apply(
            lambda row: d2v_dm.wv.wmdistance(row["words_clean_a"], row[
                "words_clean_b"]),
            axis=1)

        df_features["bow_dist_cosine"] = paired_cosine_distances(
            doc_vec_bow_emb_a, doc_vec_bow_emb_b)
        df_features["bow_dist_euclidean"] = paired_euclidean_distances(
            doc_vec_bow_emb_a, doc_vec_bow_emb_b)
        df_features["bow_word_mover_dist"] = df_intermediate.apply(
            lambda row: d2v_bow.wv.wmdistance(row["words_clean_a"], row[
                "words_clean_b"]),
            axis=1)

        print("--> Compute edit distance...")
        df_features["edit_distance"] = df_intermediate.apply(
            lambda x: nltk.edit_distance(x["clean_a"], x["clean_b"]), axis=1)

        return df_features

    print("Compute train features...")
    train_features = preprocess(train)

    print("Compute test features...")
    test_features = preprocess(test)

    print("Store features...")
    train_features.to_csv("./data/distance_features_train.csv", index=False)
    test_features.to_csv("./data/distance_features_test.csv", index=False)
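
As a usage note (not part of the original), compute_distance_features takes no arguments and works on fixed paths; a hypothetical driver:

if __name__ == "__main__":
    # Assumes ./data/train.csv, ./data/test.csv and the GloVe file exist;
    # writes distance_features_train.csv and distance_features_test.csv under ./data/.
    compute_distance_features()
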
import numpy as np
import gensim  # needed below for gensim.models.ldamodel.LdaModel
from gensim import corpora, utils, similarities
from gensim.parsing.preprocessing import STOPWORDS
from gensim.models.doc2vec import Doc2Vec, Word2Vec
#from read_wiki import stream
import pdb


dir_path = './'
#doc2vec_path = dir_path + 'wiki_model2.doc2vec'
#doc2vec300_path = dir_path + 'wiki_model5.doc2vec'
word2vec_path = dir_path + 'pretrained_word2vec.bin'

print("Loading word2vec and doc2vec models")
#doc2vec_model = Doc2Vec.load(doc2vec_path)
word2vec_model = Word2Vec.load_word2vec_format(word2vec_path, binary=True)
#doc2vec300_model = Doc2Vec.load(doc2vec300_path)
print("Models loaded, proceeding...")
np.random.seed(42)
random_word = np.random.uniform(low=-0.25, high=0.25, size=(300,))


def tokenize(text, k):
    return [token for token in utils.simple_preprocess(text) if token not in STOPWORDS]


def lda(text):
    dict_path = dir_path + 'cs_lda6.dict'
    lda_path = dir_path + 'wiki_model6.ldamodel'
    dictionary = corpora.Dictionary.load(dict_path)
    model = gensim.models.ldamodel.LdaModel.load(lda_path)