Example #1
import nltk
import keras
from gensim.sklearn_api import W2VTransformer


def encode_dataset(dataset, max_len, enc_dim):
    # Represent each word by an enc_dim-dimensional word2vec vector and
    # keep at most max_len words per document.

    dataset['text'] = dataset['text'].map(
        lambda text: nltk.tokenize.word_tokenize(text))
    dataset['length'] = dataset['text'].map(lambda text: len(text))

    model = W2VTransformer(size=enc_dim, min_count=1, seed=1)
    wordvecs = model.fit(dataset['text'].values)

    embeddings = []
    targets = []
    for row in range(len(dataset['text'])):
        # Slicing also handles documents shorter than max_len.
        embeddings.append(wordvecs.transform(dataset['text'][row][:max_len]))
        targets.append(dataset['spam'][row])

    # Pad the variable-length sequences of word vectors; a float dtype is
    # needed so the embeddings are not cast to integers.
    x_lstm_sentence_seq = keras.preprocessing.sequence.pad_sequences(
        embeddings, dtype='float32')

    return x_lstm_sentence_seq, targets
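A minimal usage sketch with a hypothetical two-row DataFrame (the 'text' and 'spam' columns are the ones the function expects; NLTK's 'punkt' tokenizer data must be available):

import pandas as pd

# Hypothetical toy dataset with the columns encode_dataset expects.
toy = pd.DataFrame({
    'text': ['win a free prize now', 'see you at the meeting tomorrow'],
    'spam': [1, 0],
})
# x: zero-padded array of per-word vectors for each document, y: spam labels
x, y = encode_dataset(toy, max_len=10, enc_dim=8)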
Example #2
    def fit(self, x, y=None):
        """
        Fit a word2vec model on x, a list of sentences.
        """
        self.tokens = [s.split() for s in x]
        self.wordvecs = W2VTransformer(
            size=self.size, min_count=self.min_count,
            window=self.window, sg=self.sg).fit(self.tokens)
        return self
Example #3
    def w2v_scikit(data):
        from gensim.sklearn_api import W2VTransformer

        # Create a model that represents each word by a len(data)-dimensional vector.
        model = W2VTransformer(size=len(data), min_count=1, seed=1)

        # Vector representations of the words 'taken' and 'arms'.
        wordvecs = model.fit(data).transform(['taken', 'arms'])
        assert wordvecs.shape == (2, len(data))
        return wordvecs
Example #4
    def __init__(self,
                 corpus,
                 idx,
                 dim=50,
                 window=3,
                 training_algorithm='skip',
                 n_epochs=5):
        self.name = 'word2vec'
        self.corpus = corpus
        self.idx = idx

        self.dim = dim
        self.window = window
        self.n_epochs = n_epochs

        logger.info("Inferring word2vec from data")
        self.corpus = [simple_preprocess(doc, deacc=True) for doc in corpus]
        # In gensim, sg=1 selects skip-gram and sg=0 selects CBOW.
        self.vectorizer = W2VTransformer(
            size=dim,
            window=window,
            sg=1 if training_algorithm == 'skip' else 0,
            iter=n_epochs)
        self.vectorizer = self.vectorizer.fit(self.corpus)
        # Represent each document as the mean of its word vectors.
        self.vectors = []
        for doc in self.corpus:
            doc_vector = []
            for word in doc:
                try:
                    doc_vector.append(self.vectorizer.transform(word))
                except KeyError:
                    # Word was pruned from the vocabulary (e.g. by min_count).
                    continue
            if len(doc_vector) > 0:
                self.vectors.append(np.mean(doc_vector, axis=0))
            else:
                # Fall back to a constant vector for documents with no known words.
                self.vectors.append(np.ones(shape=(1, dim)))
        self.vectors = np.concatenate(self.vectors, axis=0)
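A minimal usage sketch, assuming the enclosing class is named Word2VecEmbedder (a hypothetical name; the class definition is not shown in this snippet):

raw_docs = ['the cat sat on the mat', 'dogs and cats are friendly animals']
embedder = Word2VecEmbedder(corpus=raw_docs, idx=list(range(len(raw_docs))))
# One dim-dimensional vector per document (the mean of its word vectors).
assert embedder.vectors.shape == (len(raw_docs), 50)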
Example #5

#%%
sizes_list = []
to_concat = []
split = 0
for train, val in ShuffleSplit(n_splits=5).split(d.enc):
	print('Performing cross-validation split: {}'.format(split))
	
	# prepare the data for the fold
	d.cross_val_split(train, val)
	profiles_train, targets_train, seq_train, active_meds_train, depa_train, targets_val, seq_val, active_meds_val, depa_val = d.make_lists()
	
	# train word2vec embeddings
	w2v = Pipeline([
	('w2v', W2VTransformer(
		alpha=W2V_ALPHA, iter=W2V_ITER, size=W2V_EMBEDDING_DIM, hs=W2V_HS,
		sg=W2V_SG, min_count=W2V_MIN_COUNT, workers=W2V_WORKERS)),
	])
	print('Fitting word2vec embeddings...')
	w2v.fit(profiles_train)
	w2v.named_steps['w2v'].gensim_model.init_sims(replace=True)
	
	# fit the profile state encoder pipeline
	print('Fitting PSE...')
	pse_data = [[ap, de] for ap, de in zip(active_meds_train, depa_train)]
	n_pse_columns = len(pse_data[0])
	pse_transformers = []
	for i in range(n_pse_columns):
		pse_transformers.append(('pse{}'.format(i), CountVectorizer(binary=True, lowercase=False, preprocessor=pse_pp, analyzer=pse_a), i))
	pse_pipeline_transformers = [
		('columntrans', ColumnTransformer(transformers=pse_transformers))
		]
Example #6
with open(profiles_path, mode='rb') as file:
    data = pickle.load(file)
data = list(data.values())
print('Data successfully loaded.')

#%%[markdown]
# ## Transformers
#
# Prepare the word2vec and clustering transformers

#%%[markdown]
# ### Word2vec transformer

#%%
w2v_pipe = Pipeline([
    ('w2v', W2VTransformer()),
])
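#%%[markdown]
# As a quick illustration (not part of the original script), a pipeline of this
# form can be sanity-checked on gensim's bundled common_texts corpus; the
# hyperparameters below are placeholders, not the values used for the real data.

#%%
from gensim.test.utils import common_texts

w2v_check = Pipeline([
    ('w2v', W2VTransformer(size=10, min_count=1, seed=1)),
])
w2v_check.fit(common_texts)
# One 10-dimensional vector for the single queried word.
assert w2v_check.transform(['graph']).shape == (1, 10)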

#%%[markdown]
# ### Clustering transformer

#%%
clust_pipe = Pipeline([
    ('ac', AgglomerativeClustering()),
])

#%%[markdown]
# ## Helper functions
#
# These are scoring functions that will be used to score
# the word2vec embeddings and the clustering of the embeddings.
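#%%[markdown]
# As an illustration only (the actual scoring functions are not shown in this
# excerpt), a helper along these lines could score how well the clustering
# separates the learned word vectors, using the silhouette coefficient.

#%%
from sklearn.metrics import silhouette_score

def score_embedding_clusters(fitted_w2v_pipe, clust_pipe):
    # Word vectors learned by the word2vec step, one row per vocabulary word.
    vectors = fitted_w2v_pipe.named_steps['w2v'].gensim_model.wv.vectors
    # Cluster the vocabulary and measure cluster separation (higher is better).
    labels = clust_pipe.fit_predict(vectors)
    return silhouette_score(vectors, labels)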
Example #7
def do_word2vec(data):
    # Tokenize the raw texts and fit a 1-dimensional word2vec model on them.
    # `tokenize` and `fit_transform` are helpers defined elsewhere in this project.
    data = tokenize(data)
    model = W2VTransformer(size=1, min_count=1, seed=42)
    return fit_transform(model, data)
Example #8
def toy_model_keyed_vectors():
    """ Instantiate trainable word2vec vectorizer """
    model = W2VTransformer(size=10, min_count=1, seed=42)
    model.fit(common_texts)
    return model.gensim_model.wv
Example #9
def wrd2vc():
    # Skip-gram (sg=1) word2vec with 300-dimensional vectors; `lemmatize` is
    # used as the vocabulary trim rule and is defined elsewhere in the project.
    return W2VTransformer(size=300, window=3, min_count=3, sg=1, trim_rule=lemmatize)
Example #10
from gensim.test.utils import common_texts
from gensim.sklearn_api import W2VTransformer
import code

# Create a model to represent each word by a 10 dimensional vector.
model = W2VTransformer(size=10, min_count=1, seed=1)

# What are the vector representations of the words 'graph' and 'system'?
wordvecs = model.fit(common_texts).transform(['graph', 'system'])
assert wordvecs.shape == (2, 10)

# Drop into an interactive shell to inspect the fitted model and vectors.
code.interact(local=locals())