Example #1
0
    def __init__(self, G, window_size=5, embedding_size=64, walks_per_vertex=30, walk_length=16):
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.walks_per_vertex = walks_per_vertex
        self.walk_length = walk_length
        self.whole_size = len(G.nodes)

        self.G = G
        self.Theta = np.random.rand(self.whole_size, embedding_size)
        self.skipgram = SkipGram(window_size, embedding_size, self.whole_size, 0.01, \
            Theta=self.Theta)
Example #2
0
    def __init__(self, window_size, vocab_size, embedding_size, batch_size,
                 model_path):
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.batch_size = batch_size

        if os.path.isfile("word2vec_dataset.p"):
            self.dataset = pkl.load(open("word2vec_dataset.p", "rb"))
        else:
            self.dataset = APDataset(window_size, vocab_size)
            pkl.dump(self.dataset, open("word2vec_dataset.p", "wb"))

        self.data_generator = self.dataset.get_batch(self.batch_size)

        self.skip_gram = SkipGram(self.dataset.vocab_size, embedding_size)

        if os.path.isfile(model_path):
            self.skip_gram.load_state_dict(torch.load(model_path))
            self.skip_gram.eval()
Example #3
0
    def __init__(self,
                 filename='./text8.zip',
                 word_num=200,
                 batch_size=8,
                 skip_window=2,
                 num_skips=2,
                 embed_dim=10,
                 epoch=100,
                 lr=0.025,
                 neg_cnt=5,
                 outfile='./skip_gram',
                 dictfile='./word_dict'):
        """Init this word2vec model"""
        # params about dataset
        self.batch_size = batch_size
        self.skip_window = skip_window
        self.num_skips = num_skips
        # params about skip gram
        self.embed_num = word_num
        self.embed_dim = embed_dim
        # params about learning
        self.epoch = epoch
        self.lr = lr
        self.neg_cnt = neg_cnt
        # dataset
        self.dataset = Dataset(filename, word_num)
        if (not os.path.exists(dictfile)):
            pickle.dump(self.dataset.word_dict, open(dictfile, 'wb'))
        # skip gram
        self.outfile = outfile
        if (os.path.exists(outfile)):
            self.skip_gram = pickle.load(open(self.outfile, 'rb'))
        else:
            self.skip_gram = SkipGram(word_num, embed_dim)
        # optimizer
        self.optimizer = optim.SGD(self.skip_gram.parameters(), lr=self.lr)
Example #4
0
from scipy.special import expit
from gensim import utils, matutils  # utility fnc for pickling, common scipy operations etc
from gensim.corpora.dictionary import Dictionary
from six import iteritems, itervalues, string_types
from six.moves import xrange
from types import GeneratorType
from scipy import stats
from queue import Queue, Empty

logger = logging.getLogger(__name__)

from SkipGram import SkipGram
from BuildVocab import BuildVocab
from NegativeSampling import NegativeSampling

sg = SkipGram()
bv = BuildVocab()
ns = NegativeSampling()

class WIC2Vec:

    def __init__(
            self, articles=None, category_size=10, word_size=20, alpha=0.025, window=3, min_count=0,
            max_vocab_size=None, seed=1, workers=3, min_alpha=0.0001, negative=5, hashfxn=hash, iter=5, null_word=0):

        self.initialize_word_vectors()
        self.cum_table = None  # for negative sampling
        self.category_vector_size = int(category_size)
        self.category_layer1_size = int(category_size)
        self.word_vector_size = int(word_size)
        self.word_layer1_size = int(word_size)
Example #5
0
class DeepWalk():
    
    def __init__(self, G, window_size=5, embedding_size=64, walks_per_vertex=30, walk_length=16):
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.walks_per_vertex = walks_per_vertex
        self.walk_length = walk_length
        self.whole_size = len(G.nodes)
        
        self.G = G
        self.Theta = np.random.rand(self.whole_size, embedding_size)
        self.skipgram = SkipGram(window_size, embedding_size, self.whole_size, 0.01, \
            Theta=self.Theta)

    def __call__(self, unit_iter=5, beta=0.9, showLoss=False):
        '''
        Run DeepWalk algorithm.
        '''
        valid_length = 2*self.window_size+1
        for _ in range(self.walks_per_vertex):
            nodes = self.G.nodes().numpy()
            np.random.shuffle(nodes)
            for v in nodes:
                print('-----------', v, '------------')
                walk = self.RandomWalk(v, In_Out='Both')

                if len(walk) < valid_length:
                    print('Fail to walk long enough, skip once.')
                    continue
                self.skipgram.walk_train(walk, unit_iter=unit_iter, showLoss=showLoss)
            # One of the effective ways to facilitate convergence is to use
            #  diminishing stepsize
            self.skipgram.step_size = self.skipgram.step_size*beta

        print('Deepwalk finished.')

    def RandomWalk(self, vi_index, In_Out='Both'):
        '''
        Return a walk on graph G from vi, 
        controlled by parameters window_size, walks_per_vertex and walk_length.
        ***** DO NOT distinguish in_edges from out_edges by default *****
        In_Out='In': In edges only; 'Out': Out edges only.
        '''
        length = 0
        current = vi_index
        walk = np.empty(self.walk_length, dtype=int)
        if In_Out == 'Both':
            while length < self.walk_length:
                walk[length] = current
                pool_out = self.G.out_edges(current)[1].numpy()
                pool_in = self.G.in_edges(current)[0].numpy()
                pool = np.concatenate((pool_in, pool_out))
                if len(pool) == 0:
                    # dead end: return only the filled prefix of the walk
                    return walk[:length + 1]
            
                current = np.random.choice(pool)
                length += 1
        elif In_Out == 'In':
            while length < self.walk_length:
                walk[length] = current
                pool = self.G.in_edges(current)[0].numpy()
                if len(pool) == 0:
                    # dead end: return only the filled prefix of the walk
                    return walk[:length + 1]

                current = np.random.choice(pool)
                length += 1
        elif In_Out == 'Out':
            while length < self.walk_length:
                walk[length] = current
                pool = self.G.out_edges(current)[1].numpy()
                if len(pool) == 0:
                    # dead end: return only the filled prefix of the walk
                    return walk[:length + 1]

                current = np.random.choice(pool)
                length += 1
        return walk

    def predict(self, vi_index=0):
        '''
        Return descending indexes rank according to probabilities for vertex vi.
        '''
        return self.skipgram.testprob(vi_index).argsort()[::-1]


    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
            print('Model saved to file.')
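
A minimal usage sketch for the DeepWalk class above. The ToyGraph and _NodeView stand-ins below are hypothetical: they only mimic the graph interface the class relies on (a callable nodes attribute that also supports len(), and in_edges/out_edges returning (src, dst) tensors), and the project's local SkipGram module is assumed to be importable.

import numpy as np
import torch


class _NodeView:
    """Callable node container: G.nodes() returns node ids, len(G.nodes) gives the count."""
    def __init__(self, num_nodes):
        self.num_nodes = num_nodes

    def __call__(self):
        return torch.arange(self.num_nodes)

    def __len__(self):
        return self.num_nodes


class ToyGraph:
    """Minimal directed graph exposing only what DeepWalk uses."""
    def __init__(self, edges, num_nodes):
        self.src = torch.tensor([u for u, _ in edges])
        self.dst = torch.tensor([v for _, v in edges])
        self.nodes = _NodeView(num_nodes)

    def out_edges(self, v):
        mask = self.src == int(v)
        return self.src[mask], self.dst[mask]

    def in_edges(self, v):
        mask = self.dst == int(v)
        return self.src[mask], self.dst[mask]


# 4-node directed cycle, purely illustrative
G = ToyGraph([(0, 1), (1, 2), (2, 3), (3, 0)], num_nodes=4)
dw = DeepWalk(G, window_size=1, embedding_size=8, walks_per_vertex=2, walk_length=4)
dw(unit_iter=1)                 # run the random walks and skip-gram updates
print(dw.predict(vi_index=0))   # vertex indices ranked by predicted probability
dw.save('deepwalk_toy.pkl')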
Example #6
0
vocab_length = len(word_counts)
vocab = [p[0] for p in word_counts.most_common()]
unigrams = np.array([p[1] for p in word_counts.most_common()])  # as an array for the vectorized math below

word2int = {p[0]: i for i, p in enumerate(word_counts.most_common())}
int2word = {i: p[0] for i, p in enumerate(word_counts.most_common())}
text_indices = np.array([word2int[w] for w in words])

t = 1e-5
sampling_prob = np.sqrt(t / (unigrams / np.sum(unigrams)))
sampling_prob = np.minimum(1, sampling_prob)
# skipgrams() assumes 0 is not a word, so some shifting is done
sampling_table = np.concatenate(([0], sampling_prob))

sg = SkipGram(vocab_length, emb_length=128)
n_epochs = 10
for epoch in range(1, n_epochs + 1):
    load_prev = False if epoch == 1 else True

    # skipgrams() assumes 0 is not a word, so some shifting is done
    idx_couples = np.array(
        skipgrams(text_indices + 1,
                  vocab_length + 1,
                  window_size=4,
                  sampling_table=sampling_table,
                  negative_samples=0.)[0]) - 1
    word_indices = idx_couples[:, 0]
    context_indices = idx_couples[:, 1].reshape(-1, 1)

    sg.train(word_indices,
Example #7
0
# and a dictionary of unique words as keys and their index as values
tokenized, vocabulary = getTokenizedData(movieData["review"])

# the number of words and dimension and a size of window
vocabulary_size = len(vocabulary)
dimension_size = 100
window = 4
learning_rate = 0.001
batch_size = 100

# Total 50000 reviews,
# composed of 616273 words and 89858 unique words.
print("Total", len(movieData["review"]), "reviews,")
print("composed of", len(tokenized), "words and", len(vocabulary), "unique words.")

model = SkipGram(vocabulary_size, dimension_size).to(device)

criterion = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print("Training will be initiated with")
print(model)

print()
print("-----Training starts-----")
print()

word_cnt = 0
avg_loss = 0
epoch_cnt = 0
Example #8
0
class Word2VecRetrieval():
    def __init__(self, window_size, vocab_size, embedding_size, batch_size,
                 model_path):
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.batch_size = batch_size

        if os.path.isfile("word2vec_dataset.p"):
            self.dataset = pkl.load(open("word2vec_dataset.p", "rb"))
        else:
            self.dataset = APDataset(window_size, vocab_size)
            pkl.dump(self.dataset, open("word2vec_dataset.p", "wb"))

        self.data_generator = self.dataset.get_batch(self.batch_size)

        self.skip_gram = SkipGram(self.dataset.vocab_size, embedding_size)

        if os.path.isfile(model_path):
            self.skip_gram.load_state_dict(torch.load(model_path))
            self.skip_gram.eval()

    def train(self):

        optimizer = torch.optim.SparseAdam(self.skip_gram.parameters())

        self.skip_gram.train()

        print('Training...')

        step = 0

        for batch in self.data_generator:

            step += 1
            focus = [sample[0] for sample in batch]
            focus = torch.tensor(focus)

            pos_context = [sample[1] for sample in batch]
            pos_context = torch.tensor(pos_context)

            neg_context = [sample[2] for sample in batch]
            neg_context = torch.tensor(neg_context)

            optimizer.zero_grad()
            loss = self.skip_gram.forward(focus, pos_context, neg_context,
                                          self.batch_size)
            loss.backward()
            optimizer.step()

            if step % 100 == 0:
                print('Step: ', step, ', Loss: ', loss)

            if step % 20000 == 0:
                torch.save(self.skip_gram.state_dict(),
                           './saved_models/word2vec.pt')
                self.skip_gram.save_embedding(self.dataset.id2word,
                                              './word2vec_embedding.pkl')

    def find_similar_words(self, query, n=11):
        word_to_vec = pkl.load(open("word2vec_embedding.pkl", "rb"))
        query = process_text(query)[0]
        word_vec = word_to_vec[query]
        distances = []
        for index, word_key in enumerate(word_to_vec):
            cos_sim = np.dot(word_vec, word_to_vec[word_key]) / (
                np.linalg.norm(word_vec) *
                np.linalg.norm(word_to_vec[word_key]))
            distances.append((index, word_key, cos_sim))
        sorted_by_distance = sorted(distances,
                                    reverse=True,
                                    key=lambda tup: tup[2])
        for matching_word in sorted_by_distance[:n]:
            print(matching_word[1])

    def embed_query(self, word_to_vec, query, aggregation='mean'):

        query_repr = process_text(query)

        doc = []
        for query_term in query_repr:
            if query_term not in word_to_vec:
                continue
            else:
                doc.append(word_to_vec[query_term])

        if aggregation == 'mean':
            doc = np.mean(doc, axis=0)
        return doc

    def embed_doc(self, doc, word_to_vec, aggregation='mean'):

        doc_repr = []
        for doc_term in doc:
            if doc_term not in word_to_vec:
                continue
            else:
                doc_repr.append(word_to_vec[doc_term])

        if aggregation == 'mean':
            doc_repr = np.mean(doc_repr, axis=0)
        return doc_repr

    def make_doc_repr(self):
        word_to_vec = pkl.load(open("word2vec_embedding.pkl", "rb"))
        doc_reprs = []
        for doc_id in self.dataset.docs_by_id:
            print(doc_id)
            doc = self.dataset.docs_by_id[doc_id]
            doc_embed = self.embed_doc(doc, word_to_vec)
            doc_reprs.append((doc_id, doc_embed))
        pkl.dump(doc_reprs, open("doc_embeds.p", "wb"))

    def search(self, doc_embeds, query):

        word_to_vec = pkl.load(open("word2vec_embedding.pkl", "rb"))
        query_ranking = []
        query_embed = self.embed_query(word_to_vec, query)

        for doc_id, doc_embed in doc_embeds:
            similarity = np.dot(query_embed, doc_embed) / \
                (np.linalg.norm(query_embed)*np.linalg.norm(doc_embed))
            query_ranking.append((doc_id, float(similarity)))
        sorted_by_distance = sorted(query_ranking,
                                    reverse=True,
                                    key=lambda tup: tup[1])
        return sorted_by_distance
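
A hedged usage sketch for Word2VecRetrieval above. The window, vocabulary and embedding sizes, the batch size, and the query strings are illustrative assumptions; the project's APDataset, SkipGram, and process_text helpers are assumed to be importable, and model_path points at the checkpoint that train() writes.

# Illustrative only: all sizes, paths, and queries below are assumptions.
import pickle as pkl

retrieval = Word2VecRetrieval(window_size=5, vocab_size=50000,
                              embedding_size=200, batch_size=128,
                              model_path='./saved_models/word2vec.pt')
retrieval.train()                        # periodically writes word2vec_embedding.pkl
retrieval.find_similar_words('ocean')    # prints the words closest to the query
retrieval.make_doc_repr()                # writes doc_embeds.p
doc_embeds = pkl.load(open('doc_embeds.p', 'rb'))
print(retrieval.search(doc_embeds, 'an example query')[:10])   # top (doc_id, similarity) pairs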
Example #9
0
class Word2Vec():
    """Main class for word2vec model
    
    Attributes:
        filename:    filename of input training data
        word_num:    expected number of training words
        batch_size:  number of training samples in one batch
        skip_window: length of the skip window on each side of the target [skip_window, target, skip_window]
        num_skips:   number of context words selected in one skip window
        embed_dim:   size of each embedding vector
        epoch:       number of training iterations
        lr:          learning rate
        neg_cnt:     number of negative samples per target word
        outfile:     filename of the output trained skip-gram model
        dictfile:    filename for storing the word dict
    """
    def __init__(self,
                 filename='./text8.zip',
                 word_num=200,
                 batch_size=8,
                 skip_window=2,
                 num_skips=2,
                 embed_dim=10,
                 epoch=100,
                 lr=0.025,
                 neg_cnt=5,
                 outfile='./skip_gram',
                 dictfile='./word_dict'):
        """Init this word2vec model"""
        # params about dataset
        self.batch_size = batch_size
        self.skip_window = skip_window
        self.num_skips = num_skips
        # params about skip gram
        self.embed_num = word_num
        self.embed_dim = embed_dim
        # params about learning
        self.epoch = epoch
        self.lr = lr
        self.neg_cnt = neg_cnt
        # dataset
        self.dataset = Dataset(filename, word_num)
        if (not os.path.exists(dictfile)):
            pickle.dump(self.dataset.word_dict, open(dictfile, 'wb'))
        # skip gram
        self.outfile = outfile
        if (os.path.exists(outfile)):
            self.skip_gram = pickle.load(open(self.outfile, 'rb'))
        else:
            self.skip_gram = SkipGram(word_num, embed_dim)
        # optimizer
        self.optimizer = optim.SGD(self.skip_gram.parameters(), lr=self.lr)

    def train(self):
        """Start training the model and embedding"""
        batch_num = len(self.dataset.data) - 2 * self.skip_window
        bar = tqdm(range(self.epoch * batch_num))
        for i in bar:
            x, y = self.dataset.gen_batch(self.batch_size, self.skip_window,
                                          self.num_skips)
            neg = self.dataset.get_neg_sample(len(x), self.neg_cnt)
            x = Variable(torch.LongTensor(x))
            y = Variable(torch.LongTensor(y))
            neg = Variable(torch.LongTensor(neg))
            # backprop
            self.optimizer.zero_grad()
            loss = self.skip_gram.forward(x, y, neg)
            loss.backward()
            self.optimizer.step()
            # set output
            if (i % 200000 == 0):
                pickle.dump(self.skip_gram, open(self.outfile, 'wb'))
            bar.set_description("Loss: %0.8f" % loss.data)
            # pickle
        pickle.dump(self.skip_gram, open(self.outfile, 'wb'))
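
A minimal usage sketch for Word2Vec above, using the constructor defaults except for a single epoch; it assumes ./text8.zip is present and that the project's Dataset and SkipGram modules are importable.

# Minimal smoke test; every value except epoch is a constructor default shown above.
w2v = Word2Vec(filename='./text8.zip',
               word_num=200,
               batch_size=8,
               skip_window=2,
               num_skips=2,
               embed_dim=10,
               epoch=1,
               lr=0.025,
               neg_cnt=5,
               outfile='./skip_gram',
               dictfile='./word_dict')
w2v.train()   # trains the skip-gram model and pickles it to ./skip_gram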
Example #10
0
def main():
    print("Calculating task 2")

    sg = SkipGram(iter_corpus, load=False)
    sg.save()

    def evaluate(sentences_from, sentences_to, poly_from, poly_to):
        count_total = {}
        count_polysemous = {}
        count_untagged_correct = {}
        count_untagged_wrong = {}
        count_tagged_correct = {}
        count_tagged_wrong = {}
        count_tagged_unexpected = {}

        tagged = []

        for sidx in range(len(sentences_from)):
            sentence_from = {}
            for word in sentences_from[sidx][0]:
                if word[-1] not in sentence_from:
                    sentence_from[word[-1]] = []
                sentence_from[word[-1]].append(word)

            sentence_to = {}
            for word in sentences_to[sidx][0]:
                if word[-1] not in sentence_to:
                    sentence_to[word[-1]] = []
                sentence_to[word[-1]].append(word)

            for widx in sentence_from.keys():
                words_from = sentence_from[widx]
                if widx not in sentence_to.keys():
                    continue
                context = tuple((w[1] if w[1] else w[0]).lower()
                                for w in sentences_from[sidx][0])
                for word_from in words_from:
                    _type = word_from[2] if word_from[2] else "UNK"
                    if _type not in count_total:
                        count_total[_type] = 0
                    count_total[_type] += 1

                    poly = word_from[1], word_from[2]
                    poly = (poly in poly_from)
                    if poly:
                        if _type not in count_polysemous:
                            count_polysemous[_type] = 0
                        count_polysemous[_type] += 1

                        choice = sg.choose(context,
                                           (word_from[1] if word_from[1] else
                                            word_from[0]).lower())

                        if choice is not None:
                            lemma = (word_from[1] if word_from[1] else
                                     word_from[0]).lower()
                            #         lemma                      Word-Type                        Sense
                            if lemma == choice[1]:
                                if (
                                        word_from[3] == "" and choice[2] == -1
                                ) or (
                                        word_from[3] == choice[2]
                                ):  #word_from[2] == choice[0] and word_from[3] == choice[2]:
                                    if _type not in count_tagged_correct:
                                        count_tagged_correct[_type] = 0
                                    count_tagged_correct[_type] += 1
                                    continue

                            if word_from[3] == "" or word_from[3] == -1:
                                if _type not in count_tagged_unexpected:
                                    count_tagged_unexpected[_type] = 0
                                count_tagged_unexpected[_type] += 1
                            else:
                                if _type not in count_tagged_wrong:
                                    count_tagged_wrong[_type] = 0
                                count_tagged_wrong[_type] += 1

                        else:
                            if word_from[3] == "" or word_from[3] == -1:
                                if _type not in count_untagged_correct:
                                    count_untagged_correct[_type] = 0
                                count_untagged_correct[_type] += 1
                            else:
                                if _type not in count_untagged_wrong:
                                    count_untagged_wrong[_type] = 0
                                count_untagged_wrong[_type] += 1

        return count_total, count_polysemous, count_untagged_correct, count_untagged_wrong, count_tagged_correct, count_tagged_wrong, count_tagged_unexpected

    mapping, _, pe, pg = map_with_ili()
    pe = list(pe)
    pg = list(pg)

    de, en = get_test_data()
    count_total, count_polysemous, count_untagged_correct, count_untagged_wrong, count_tagged_correct, count_tagged_wrong, count_tagged_unexpected = evaluate(
        en, de, pe, pg)

    print("Evaluating test corpus")
    total = sum(count_total.values())

    print("Total words in test corpora", total)
    for t in count_total:
        print("Count of {}: {}({}% of test corpus)".format(
            t, count_total[t], count_total[t] / total * 100))
    print()

    for t in count_polysemous:
        print("{}: Polysemous {}({}% of test corpus)".format(
            t, count_polysemous[t], count_polysemous[t] / total * 100))
    print()

    for t in count_untagged_correct:
        print("{}: Correct Untagged {}({}% of test corpus, {}% of polysemous)".
              format(t, count_untagged_correct[t],
                     count_untagged_correct[t] / total * 100,
                     count_untagged_correct[t] / count_polysemous[t] * 100))
    for t in count_untagged_wrong:
        print(
            "{}: Incorrect Untagged {}({}% of test corpus, {}% of polysemous)".
            format(t, count_untagged_wrong[t],
                   count_untagged_wrong[t] / total * 100,
                   count_untagged_wrong[t] / count_polysemous[t] * 100))

    for t in count_tagged_correct:
        print("{}: Correct tagged {}({}% of test corpus, {}% of polysemous)".
              format(t, count_tagged_correct[t],
                     count_tagged_correct[t] / total * 100,
                     count_tagged_correct[t] / count_polysemous[t] * 100))
    for t in count_tagged_wrong:
        print("{}: Incorrect tagged {}({}% of test corpus, {}% of polysemous)".
              format(t, count_tagged_wrong[t],
                     count_tagged_wrong[t] / total * 100,
                     count_tagged_wrong[t] / count_polysemous[t] * 100))
    for t in count_tagged_unexpected:
        print(
            "{}: Unexpected tagged {}({}% of test corpus, {}% of polysemous)".
            format(t, count_tagged_unexpected[t],
                   count_tagged_unexpected[t] / total * 100,
                   count_tagged_unexpected[t] / count_polysemous[t] * 100))