Example #1
    def __init__(self, model_path, context_path, learning_rate=0.001, n_negative_sample=15):
        self.E = utils.load_pkl(model_path + 'embedding.pkl', local=True)
        self.E = np.array(self.E)
        self.F = utils.load_pkl(model_path + 'softmax_w.pkl', local=True)
        self.n_vocab = len(self.E)
        self.d = self.F.shape[1]
        self.n_context = self.F.shape[0]
        self.data_path = model_path
        self.n_negative_sample = n_negative_sample
        self.scope = 0

        # Context distribution
        self.context_distribution = utils.load_pkl(context_path + config['SNML']['context_dist'], local=True)
        self.context_distribution = self.context_distribution ** (3 / 4)
        self.context_distribution = self.context_distribution / sum(self.context_distribution)

        # Optimizer initialization
        self.lr = learning_rate

        # Context sample look up table
        table_size = 100000000  # Length of the unigram table
        table = np.zeros(table_size, dtype=np.uint32)

        p = 0  # Cumulative probability
        i = 0
        for j in range(self.n_context):
            p += self.context_distribution[j]
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table

        # Set random seed
        np.random.seed(int(config['OTHER']['random_seed']))
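A minimal usage sketch, assuming the unigram table above serves word2vec-style negative sampling; sample_negative_contexts is a hypothetical helper, not part of the original class.

    def sample_negative_contexts(self):
        # Draw n_negative_sample context ids from the precomputed unigram table;
        # frequent contexts occupy more table slots, so they are sampled more often.
        idx = np.random.randint(0, len(self.table), self.n_negative_sample)
        return self.table[idx]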
Example #2
    def __init__(self, data_path, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08):
        self.E = utils.load_pkl(data_path + 'embedding.pkl')
        self.C = utils.load_pkl(data_path + 'softmax_w.pkl')
        self.b = utils.load_pkl(data_path + 'softmax_b.pkl')
        self.V = self.E.shape[0]
        self.K = self.E.shape[1]
        self.V_dash = self.C.shape[0]
        self.data_path = data_path

        # Adam optimizer initialization
        self.t = 256040
        self.t_default = 256040
        self.beta1 = beta1
        self.beta2 = beta2
        self.lr = learning_rate
        self.epsilon = epsilon
        self.beta1_t = beta1 ** self.t
        self.beta2_t = beta2 ** self.t

        # Initialize Adam moment estimates (first and second moments)
        self.mE_t = np.zeros((self.V, self.K))
        self.mC_t = np.zeros((self.V_dash, self.K))
        self.mb_t = np.zeros(self.V_dash)
        self.vE_t = np.zeros((self.V, self.K))
        self.vC_t = np.zeros((self.V_dash, self.K))
        self.vb_t = np.zeros(self.V_dash)
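A minimal sketch of how these Adam buffers are typically consumed, assuming the standard Adam update with bias correction; _adam_update_E and gE (the gradient with respect to self.E) are hypothetical names, not from the original class.

    def _adam_update_E(self, gE):
        # Standard Adam step (sketch): update biased moments, correct the bias,
        # then apply the parameter update to the embedding matrix E.
        self.t += 1
        self.beta1_t *= self.beta1
        self.beta2_t *= self.beta2
        self.mE_t = self.beta1 * self.mE_t + (1 - self.beta1) * gE
        self.vE_t = self.beta2 * self.vE_t + (1 - self.beta2) * (gE ** 2)
        m_hat = self.mE_t / (1 - self.beta1_t)
        v_hat = self.vE_t / (1 - self.beta2_t)
        self.E = self.E - self.lr * m_hat / (np.sqrt(v_hat) + self.epsilon)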
Example #3
    def __init__(self,
                 model_path,
                 context_path,
                 n_neg_sample=200,
                 n_context_sample=1000,
                 learning_rate=0.0004,
                 random_seed=1234):
        # Load parameters
        self.embedding = utils.load_pkl(model_path +
                                        config['SNML']['embedding'])
        self.softmax_w = utils.load_pkl(model_path +
                                        config['SNML']['softmax_w'])
        self.n_vocab = self.embedding.shape[0]
        self.n_embedding = self.embedding.shape[1]
        self.n_context = self.softmax_w.shape[0]
        self.n_neg_sample = n_neg_sample
        self.context_path = context_path
        self.n_context_sample = n_context_sample
        self.scope = 0
        self.learning_rate = learning_rate

        # paths
        self.data_path = model_path

        # Set uniform distribution as default for context sampling
        self.contexts = []

        # set computation
        self._set_computation(random_seed)
Example #4
    def __init__(self,
                 data_path,
                 context_path,
                 learning_rate=0.2,
                 beta=0.9,
                 n_context_sample=600):
        self.E = utils.load_pkl(data_path + 'embedding.pkl')
        self.C = utils.load_pkl(data_path + 'softmax_w.pkl')
        self.b = utils.load_pkl(data_path + 'softmax_b.pkl')
        self.V = self.E.shape[0]
        self.K = self.E.shape[1]
        self.V_dash = self.C.shape[0]
        self.data_path = data_path
        self.context_path = context_path
        self.n_context_sample = n_context_sample
        self.scope = 0

        # Load context distribution
        self.context_distribution = utils.load_pkl(context_path +
                                                   'context_distribution.pkl')
        # Load the pre-sampled contexts file (name depends on n_context_sample)
        self.sample_contexts_file_name = os.path.join(
            self.context_path,
            'sample_contexts_{}.pkl'.format(n_context_sample))
        self.contexts = utils.load_pkl(self.sample_contexts_file_name)

        # Momentum hyperparameters
        self.learning_rate = learning_rate
        self.beta = beta
        self.vE = np.zeros((self.V, self.K))
        self.vC = np.zeros((self.V_dash, self.K))
        self.vb = np.zeros(self.V_dash)
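A minimal sketch of the momentum update these velocity buffers support, assuming the classical momentum rule; _momentum_update_E and gE are hypothetical names, not part of the original class.

    def _momentum_update_E(self, gE):
        # Momentum step (sketch): blend the new gradient into the velocity,
        # then move the embedding matrix E against it.
        self.vE = self.beta * self.vE + (1 - self.beta) * gE
        self.E = self.E - self.learning_rate * self.vE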
Example #5
    def __init__(self, model_path, data_file):
        # Load parameters
        self.embedding = utils.load_pkl(model_path +
                                        config['SNML']['embedding'])
        self.softmax_w = utils.load_pkl(model_path +
                                        config['SNML']['softmax_w'])
        self.n_vocab = self.embedding.shape[0]
        self.n_embedding = self.embedding.shape[1]
        self.n_context = self.softmax_w.shape[0]
        self.batch_size = 5000
        self.sum_log_likelihood = 0
        self.k = self.embedding.shape[0] * self.embedding.shape[1] + \
                 self.softmax_w.shape[0] * self.softmax_w.shape[1]
        print('Matrix E: Vw * d: ', self.embedding.shape[0],
              self.embedding.shape[1])
        print('Matrix F: d * Bc: ', self.softmax_w.shape[1],
              self.softmax_w.shape[0])
        print('k: ', self.k)

        # paths
        self.model_path = model_path
        self.filename = data_file

        # set computation
        self._set_computation(self.filename,
                              batch_size=self.batch_size,
                              epochs=1)
Example #6
    def change_model(self, model_path):
        # Load parameters
        self.embedding = utils.load_pkl(model_path +
                                        config['SNML']['embedding'])
        self.softmax_w = utils.load_pkl(model_path +
                                        config['SNML']['softmax_w'])
        self.softmax_b = utils.load_pkl(model_path +
                                        config['SNML']['softmax_b'])
        self.n_vocab = self.embedding.shape[0]
        self.n_embedding = self.embedding.shape[1]
        self.n_context = self.softmax_w.shape[0]

        # paths
        self.data_path = model_path

        # set computation
        self._set_computation()
Example #7
    def __init__(self, input_path, output_path, n_embedding):
        self.n_embedding = n_embedding
        self.embedding = np.array([])
        self.data_path = input_path

        # create output directory
        self.output_dictionary = output_path + config['TRAIN'][
            'output_dir'].format(n_embedding)
        if not os.path.exists(self.output_dictionary):
            os.makedirs(self.output_dictionary)

        # read dictionaries
        self.int_to_vocab = utils.load_pkl(input_path +
                                           config['TRAIN']['vocab_dict'])
        self.int_to_cont = utils.load_pkl(input_path +
                                          config['TRAIN']['context_dict'])
        self.n_vocab = len(self.int_to_vocab)
        self.n_context = len(self.int_to_cont)
Example #8
    def __init__(self,
                 data_path,
                 context_path,
                 learning_rate=0.001,
                 beta1=0.9,
                 beta2=0.999,
                 epsilon=1e-08,
                 n_context_sample=600):
        self.E = utils.load_pkl(data_path + 'embedding.pkl')
        self.C = utils.load_pkl(data_path + 'softmax_w.pkl')
        self.b = utils.load_pkl(data_path + 'softmax_b.pkl')
        self.V = self.E.shape[0]
        self.K = self.E.shape[1]
        self.V_dash = self.C.shape[0]
        self.data_path = data_path
        self.context_path = context_path
        self.n_context_sample = n_context_sample
        self.scope = 0

        # Load context distribution
        self.context_distribution = utils.load_pkl(context_path +
                                                   'context_distribution.pkl')
        # Load the pre-sampled contexts file (name depends on n_context_sample)
        self.sample_contexts_file_name = os.path.join(
            self.context_path,
            'sample_contexts_{}.pkl'.format(n_context_sample))
        self.contexts = utils.load_pkl(self.sample_contexts_file_name)

        # Adam optimizer initialization
        self.t = 396893
        self.beta1 = beta1
        self.beta2 = beta2
        self.lr = learning_rate
        self.epsilon = epsilon
        self.beta1_t = beta1**self.t
        self.beta2_t = beta2**self.t

        # Initialize Adam moment estimates (first and second moments)
        self.mE_t = np.zeros((self.V, self.K))
        self.mC_t = np.zeros((self.V_dash, self.K))
        self.mb_t = np.zeros(self.V_dash)
        self.vE_t = np.zeros((self.V, self.K))
        self.vC_t = np.zeros((self.V_dash, self.K))
        self.vb_t = np.zeros(self.V_dash)
Example #9
    def __init__(self, config, logger):
        # load parameters
        self.config = config
        self.logger = logger
        self.data_path = config['data_path']

        # load docs
        self.data_dict = load_pkl(self.data_path + '/texts.pkl')
        self.stop_words = load_stop_words(self.data_path + '/cn_stopwords.txt')
        docs = list(self.data_dict.values())

        # Tokenize the documents
        self.docs = self._tokenizer(docs)
Example #10
    def __init__(self,
                 model_path,
                 sample_path,
                 output_path,
                 context_distribution_file,
                 n_train_sample=1000,
                 n_neg_sample=200,
                 n_context_sample=1000):
        # Load parameters
        self.embedding = utils.load_pkl(model_path +
                                        config['SNML']['embedding'])
        self.softmax_w = utils.load_pkl(model_path +
                                        config['SNML']['softmax_w'])
        self.softmax_b = utils.load_pkl(model_path +
                                        config['SNML']['softmax_b'])
        self.n_vocab = self.embedding.shape[0]
        self.n_embedding = self.embedding.shape[1]
        self.n_context = self.softmax_w.shape[0]
        self.n_neg_sample = n_neg_sample

        # paths
        self.data_path = model_path
        self.sample_path = sample_path
        self.output_path = output_path
        self.n_files = int(config['SNML']['n_files'])

        # sample data
        self.n_train_sample = n_train_sample
        self.words = []
        self.contexts = []
        self.epochs = 0
        self._set_training_sample(20)
        self.sample_contexts, self.sample_contexts_prob = utils.sample_contexts(
            context_distribution_file, n_context_sample)
        self.n_context_sample = n_context_sample

        # set computation
        self._set_computation()
Example #11
def evaluate_model(dim):
    kl_list = []
    mae_list = []
    cos_list = []
    rho_list = []
    parent_dir = '../notebooks/output/100-context-500000-data-38-questions/'

    model = Model(parent_dir + '1/full/{}dim/'.format(dim),
                  '../../data/text8/',
                  learning_rate=0.1)

    context_distribution = utils.load_pkl(parent_dir +
                                          'context_distribution.dict')

    for i in range(len(context_distribution)):
        true_dis = context_distribution[i]
        pred_dis = model.get_context_dis(i)

        kl_list.append(kl_divergence(pred_dis, true_dis))
        mae_list.append(mean_absolute_error(pred_dis, true_dis))
        cos_list.append(cos(pred_dis, true_dis))
        rho_list.append(rho(pred_dis, true_dis))

    return np.sum(rho_list)
Example #12
from utils.tools import load_pkl


def export_embedding(embedding, filename):
    # write embedding result to file
    output = open(filename, 'w', encoding='utf-8')
    for i in range(embedding.shape[0]):
        text = int_to_vocab[i]
        for j in embedding[i]:
            text += ' %f' % j
        text += '\n'
        try:
            output.write(text)
        except Exception:
            # Do not abort the export on a single bad row; log it instead.
            print(text)

    output.close()


if __name__ == "__main__":
    dims = [50, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200, 300]
    path = '../../output/wiki/20200126/1/train1/{}dim/step-90/'

    int_to_vocab = load_pkl('../../data/wiki/dict/int_to_vocab.dict', local=True)

    for dim in dims:
        embedding = load_pkl(path.format(dim) + 'embedding.pkl', local=True)
        export_embedding(embedding, path.format(dim) + 'embedding.txt')
Example #13
from collections import Counter
from utils.tools import load_pkl, save_pkl
import numpy as np

if __name__ == "__main__":
    # Data file
    raw_data_path = '../data/raw data/test.txt'
    context_to_dict_path = 'data/text8/dict/cont_to_int.dict'
    output_path = 'data/text8/contexts/distribution_from_raw.pkl'
    int_to_cont = load_pkl('data/text8/dict/int_to_cont.dict', local=True)

    # Load data
    with open(raw_data_path, encoding='utf-8') as f:
        words = f.read().split()

    # Load dict
    context_to_dict = load_pkl(context_to_dict_path, local=True)

    # Convert vocab to int
    context = []
    for word in words:
        if word in context_to_dict:
            context.append(context_to_dict[word])

    context_counts = Counter(context)
    n_context = len(context_to_dict)
    n_data = sum(list(context_counts.values()))

    context_distribution = np.zeros(n_context)
    for c, count in context_counts.items():
        context_distribution[c] = count / n_data

    # Persist the distribution (save_pkl and output_path are defined above)
    save_pkl(context_distribution, output_path)
Example #14
    def __init__(self,
                 input_path,
                 output_path,
                 n_embedding,
                 batch_size,
                 epochs,
                 n_sampled,
                 snml=False,
                 snml_dir='',
                 random_seed=1234):
        self.n_embedding = n_embedding
        self.embedding = np.array([])
        self.data_path = input_path

        # create output directory
        self.output_dictionary = output_path + config['TRAIN'][
            'output_dir'].format(n_embedding)
        if not os.path.exists(self.output_dictionary):
            os.makedirs(self.output_dictionary)

        if snml:
            self.snml_dir = snml_dir + config['TRAIN']['output_dir'].format(
                n_embedding)
            if not os.path.exists(self.snml_dir):
                os.makedirs(self.snml_dir)

        # sync with gcs
        utils.download_from_gcs(input_path + config['TRAIN']['int_to_vocab'])
        utils.download_from_gcs(input_path + config['TRAIN']['vocab_to_int'])
        utils.download_from_gcs(input_path + config['TRAIN']['int_to_cont'])
        utils.download_from_gcs(input_path + config['TRAIN']['train_data'])

        # read dictionaries
        self.int_to_vocab = utils.load_pkl(input_path +
                                           config['TRAIN']['int_to_vocab'])
        self.vocab_to_int = utils.load_pkl(input_path +
                                           config['TRAIN']['vocab_to_int'])
        self.int_to_cont = utils.load_pkl(input_path +
                                          config['TRAIN']['int_to_cont'])
        self.n_vocab = len(self.int_to_vocab)
        self.n_context = len(self.int_to_cont)

        # computation graph
        self.train_graph = tf.Graph()
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_sampled = n_sampled

        # construct computation graph
        if snml:
            filename = self.data_path + config['TRAIN']['train_data_snml']
        else:
            filename = self.data_path + config['TRAIN']['train_data']
        self.snml = snml
        self._set_training_file(filename)
        self._set_computation(filename, random_seed)

        # training data
        self.n_datums = utils.count_line(filename)
        self.n_batches = self.n_datums // self.batch_size

        # evaluator
        self.word_analogy = WordAnalogy(config['TRAIN']['question_file'])
        self.word_analogy.set_top_words(config['TRAIN']['top_word_file'])

        # output file
        self.embedding_file = config['TRAIN']['embedding'].format(
            self.n_embedding, n_sampled, epochs, batch_size)

        print('Initialized model with {} samples; will run {} batches per epoch.'
              .format(self.n_datums, self.n_batches))
Example #15
    def __init__(self, input_path, output_path, n_embedding, batch_size, epochs, n_sampled,
                 snml=False, snml_dir=''):
        self.n_embedding = n_embedding
        self.embedding = np.array([])
        self.data_path = input_path

        # create output directory
        self.output_dictionary = output_path + config['TRAIN']['output_dir'].format(n_embedding)
        if not os.path.exists(self.output_dictionary):
            os.makedirs(self.output_dictionary)

        if snml:
            self.snml_dir = snml_dir + config['TRAIN']['output_dir'].format(n_embedding)
            if not os.path.exists(self.snml_dir):
                os.makedirs(self.snml_dir)

        # sync with gcs
        utils.download_from_gcs(input_path + config['TRAIN']['int_to_vocab'])
        utils.download_from_gcs(input_path + config['TRAIN']['vocab_to_int'])
        utils.download_from_gcs(input_path + config['TRAIN']['int_to_cont'])
        utils.download_from_gcs(input_path + config['TRAIN']['cont_to_int'])
        utils.download_from_gcs(input_path + config['TRAIN']['train_data'])

        # read dictionaries
        self.int_to_vocab = utils.load_pkl(input_path + config['TRAIN']['int_to_vocab'])
        self.vocab_to_int = utils.load_pkl(input_path + config['TRAIN']['vocab_to_int'])
        self.int_to_cont = utils.load_pkl(input_path + config['TRAIN']['int_to_cont'])
        self.cont_to_int = utils.load_pkl(input_path + config['TRAIN']['cont_to_int'])
        self.n_vocab = len(self.int_to_vocab)
        self.n_context = len(self.int_to_cont)

        # computation parameters
        self.batch_size = batch_size
        self.epochs = epochs
        self.n_sampled = n_sampled
        random_seed = int(config['OTHER']['random_seed'])
        np.random.seed(random_seed)  # Set seed for weights
        self.E = [np.random.uniform(-0.8, 0.8, self.n_embedding) for _ in range(self.n_vocab)]
        self.F = np.random.uniform(-0.8, 0.8, (self.n_context, self.n_embedding))
        np.random.seed(random_seed)  # Reset seed for everything else (negative samples)

        # construct computation graph
        if snml:
            filename = self.data_path + config['TRAIN']['train_data_snml']
        else:
            filename = self.data_path + config['TRAIN']['train_data']
        self.snml = snml
        self.filename = filename

        # training data
        self.n_datums = utils.count_line(filename)
        self.n_batches = self.n_datums // self.batch_size

        # Context distribution
        self.context_distribution = utils.load_pkl(self.data_path + config['TRAIN']['context_dist'])
        self.context_distribution = self.context_distribution ** (3/4)
        self.context_distribution = self.context_distribution / sum(self.context_distribution)

        # Context sample look up table
        table_size = 100000000  # Length of the unigram table
        table = np.zeros(table_size, dtype=np.uint32)

        p = 0  # Cumulative probability
        i = 0
        for j in range(self.n_context):
            p += self.context_distribution[j]
            while i < table_size and float(i) / table_size < p:
                table[i] = j
                i += 1
        self.table = table
        self.pos_neg = [1] + [0 for _ in range(self.n_sampled)]

        # evaluator
        self.word_analogy = WordAnalogy(config['TRAIN']['question_file'])
        # self.word_analogy.set_top_words(config['TRAIN']['top_word_file'])

        # output file
        self.embedding_file = config['TRAIN']['embedding'].format(self.n_embedding, n_sampled, epochs, batch_size)

        print('Initialized model with {} samples; will run {} batches per epoch.'
              .format(self.n_datums, self.n_batches))
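A minimal sketch of how the unigram table and pos_neg labels above could be combined into one training example, assuming word2vec-style negative sampling; _draw_training_columns is a hypothetical helper, not part of the original class.

    def _draw_training_columns(self, context):
        # Pair the observed context with n_sampled negatives drawn from the
        # unigram table; labels follow self.pos_neg ([1] + n_sampled zeros).
        negatives = self.table[np.random.randint(0, len(self.table), self.n_sampled)]
        columns = np.concatenate(([context], negatives))
        labels = np.array(self.pos_neg)
        return columns, labels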
Example #16
    # Set up parameters
    if args.continue_scope == 0:
        args.continue_scope = args.scope

    # read snml train file
    utils.download_from_gcs(args.snml_train_file)
    data = np.genfromtxt(args.snml_train_file, delimiter=',').astype(int)

    # Initialize model
    model = Model(args.model, args.context_path, n_context_sample=args.n_context_sample,
                  learning_rate=args.learning_rate)

    # Continue from previous
    previous_file = args.model + '{}-step/scope-{}-snml_length.pkl'.format(args.continue_from, args.continue_scope)
    if os.path.isfile(previous_file):
        snml_lengths = utils.load_pkl(previous_file)
    else:
        snml_lengths = []

    for i in range(args.continue_from):
        model.train_one_sample(data[i][0], data[i][1], epochs=args.epochs, update_weight=True)
    print('Continue step: {}, from file: {}'.format(args.continue_from, previous_file))

    # Run snml
    print_step = 10
    start = time.time()
    for i in range(args.continue_from, args.scope):
        w = data[i][0]
        c = data[i][1]

        length = model.snml_length_sampling(w, c, epochs=args.epochs)
Example #17
import os
import utils.tools as utils

if __name__ == "__main__":
    context_path = '../notebooks/output/50-context-500000-data-18-questions/contexts/'
    n_context_sample = 50
    scope = 5000
    file_name = os.path.join(context_path,
                             'sample_contexts_{}.pkl'.format(n_context_sample))

    context_distribution = utils.load_pkl(context_path +
                                          'context_distribution.pkl')
    if os.path.exists(file_name):
        print('Load file')
        contexts = utils.load_pkl(file_name)
    else:
        contexts = []

    print('Current contexts: ', len(contexts))

    # Sample contexts
    if scope + 1 > len(contexts):
        for i in range(scope - len(contexts)):
            samples = utils.sample_context_uniform(len(context_distribution),
                                                   n_context_sample)
            contexts.append(samples)

    # Save result back to pkl
    utils.save_pkl(contexts, file_name)

    print(len(contexts))