import json
import pickle

# `Vocabulary` is assumed to come from this project's own vocabulary module.

def generate_memnet_vocabulary():
    # Pool the train / dev / test splits of the Holl-E memory-network data.
    data = []
    with open('holle/memnet_data/train_data.json', 'r') as memnet_train_file:
        data += json.load(memnet_train_file)
    with open('holle/memnet_data/dev_data.json', 'r') as memnet_dev_file:
        data += json.load(memnet_dev_file)
    with open('holle/memnet_data/test_data.json', 'r') as memnet_test_file:
        data += json.load(memnet_test_file)

    # Every context utterance, supporting fact, and response contributes tokens.
    docs = []
    for row in data:
        context_lst = row['context']
        fact_lst = row['facts']
        response = row['response']
        docs += context_lst + fact_lst + [response]

    vocab = Vocabulary.from_documents(docs)
    with open('memnet_data/vocab.pkl', 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)
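# A minimal round-trip sketch, not part of the original script:
# `load_memnet_vocabulary` is a hypothetical helper showing how the vocabulary
# pickled above would be read back. `Vocabulary` must be importable at
# unpickle time for this to work.
def load_memnet_vocabulary():
    with open('memnet_data/vocab.pkl', 'rb') as vocab_file:
        return pickle.load(vocab_file)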
if __name__ == '__main__':
    arguments = parse_args()

    logger.info('Loading config')
    with open(arguments.config) as config_file:
        # safe_load avoids executing arbitrary YAML tags; yaml.load without an
        # explicit Loader is deprecated in PyYAML >= 5.1.
        config = yaml.safe_load(config_file)

    logger.info('Initializing input stream')
    input_stream = LineSentence(
        arguments.corpus,
        max_sentence_length=config['sliding_window']['change_every_words']
    )

    min_word_freq = config['vocabulary']['min_freq']
    logger.info('Building vocabulary with min_freq={}'.format(min_word_freq))
    vocab = Vocabulary.from_documents(input_stream, min_word_freq)
    vocabulary_size = len(vocab)
    logger.info('Vocabulary size: {}'.format(vocabulary_size))

    logger.info('Building negative sampling distribution')
    negative_sampler = HierarchicalSampler(
        vocab=vocab,
        alpha=config['negative_sampling']['alpha'],
        chunks_num=config['negative_sampling']['vocab_chunks_num']
    )

    logger.info('Building model computation graph')
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=config['training_params']['initial_learning_rate']
    )
    negative_samples_num = config['sliding_window']['max_size'] * \