import argparse
import operator

import numpy as np


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', type=argparse.FileType())
    parser.add_argument('output_file', type=argparse.FileType('w'))
    parser.add_argument('vector_size', type=int)
    parser.add_argument('context_size', type=int)
    parser.add_argument('vocabulary_size', type=int)
    args = parser.parse_args()

    # Preprocess the corpus: tokenize, lowercase, and map words to indices
    sentences = list(lower(tokenize(args.input_file)))
    dictionary = build_dictionary(sentences, args.vocabulary_size)
    indices = to_indices(sentences, dictionary)
    inputs, outputs = create_context(indices, args.context_size)

    # Train skip-gram vectors with stochastic gradient descent
    cost_gradient = bind_cost_gradient(skip_gram_cost_gradient, inputs, outputs,
                                       sampler=get_stochastic_sampler(100))
    # One extra row for index 0, which is reserved for out-of-vocabulary words
    initial_parameters = np.random.normal(size=(2, len(dictionary) + 1, args.vector_size))
    parameters, cost_history = gradient_descent(cost_gradient, initial_parameters, 10000)

    # Combine input and output vectors and write them out, one word per line
    input_vectors, output_vectors = parameters
    word_vectors = input_vectors + output_vectors
    sorted_pairs = sorted(dictionary.items(), key=operator.itemgetter(1))
    words = [word for word, index in sorted_pairs]
    for word in words:
        vector = word_vectors[dictionary[word]]
        vector_string = ' '.join(str(element) for element in vector)
        print(word, vector_string, file=args.output_file)
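# create_context isn't shown in this excerpt. A minimal sketch of what it is
# assumed to do for skip-gram: pair every word with each neighbour within
# context_size positions, returning parallel lists of centre-word and
# context-word indices. The function body below is an assumption, not the
# project's actual implementation.
def create_context(indices, context_size):
    inputs, outputs = [], []
    for sentence in indices:
        for position, center in enumerate(sentence):
            # Clip the context window at sentence boundaries
            start = max(0, position - context_size)
            end = min(len(sentence), position + context_size + 1)
            for context_position in range(start, end):
                if context_position != position:
                    inputs.append(center)
                    outputs.append(sentence[context_position])
    return inputs, outputs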
def train(self, sentences, iterations=1000):
    # Preprocess sentences to create indices of context and next words
    self.dictionary = build_dictionary(sentences, self.vocabulary_size)
    indices = to_indices(sentences, self.dictionary)
    self.reverse_dictionary = {index: word for word, index in self.dictionary.items()}
    inputs, outputs = self.create_context(indices)

    # Create cost and gradient function for gradient descent
    shapes = [self.W_shape, self.U_shape, self.H_shape, self.C_shape]
    flatten_nplm_cost_gradient = flatten_cost_gradient(nplm_cost_gradient, shapes)
    cost_gradient = bind_cost_gradient(flatten_nplm_cost_gradient, inputs, outputs,
                                       sampler=get_stochastic_sampler(10))

    # Train neural network
    parameters_size = sum(np.prod(shape) for shape in shapes)
    initial_parameters = np.random.normal(size=parameters_size)
    self.parameters, cost_history = gradient_descent(cost_gradient, initial_parameters, iterations)
    return cost_history
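# flatten_cost_gradient isn't shown in this excerpt. A minimal sketch under
# the assumption that nplm_cost_gradient takes a list of parameter matrices
# and returns (cost, list of gradient matrices): the wrapper reshapes a flat
# parameter vector into those matrices and flattens the gradients back out,
# so a generic gradient_descent routine can treat all parameters as one
# vector. The wrapper's signature is also an assumption.
import numpy as np

def flatten_cost_gradient(cost_gradient, shapes):
    def flattened_cost_gradient(flat_parameters, input_data, output_data):
        # Split the flat vector into one matrix per shape
        parameters = []
        offset = 0
        for shape in shapes:
            size = int(np.prod(shape))
            parameters.append(flat_parameters[offset:offset + size].reshape(shape))
            offset += size
        cost, gradients = cost_gradient(parameters, input_data, output_data)
        # Flatten the gradients in the same order for the optimiser
        flat_gradient = np.concatenate([gradient.ravel() for gradient in gradients])
        return cost, flat_gradient
    return flattened_cost_gradient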
def test_to_indices(self):
    sentences = [['a', 'man', 'a', 'woman'], ['a', 'man']]
    dictionary = build_dictionary(sentences, 2)
    actual = list(to_indices(sentences, dictionary))
    expected = [[1, 2, 1, 0], [1, 2]]
    self.assertSequenceEqual(expected, actual)
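# to_indices isn't shown in this excerpt, but the test above pins down its
# contract: look each word up in the dictionary and map out-of-vocabulary
# words ('woman' here) to index 0. A minimal sketch; the generator form is
# an assumption, suggested by the list(...) call in the test.
def to_indices(sentences, dictionary):
    for sentence in sentences:
        yield [dictionary.get(word, 0) for word in sentence]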
def test_build_dictionary(self):
    sentences = [['a', 'man', 'a', 'woman'], ['a', 'man']]
    actual = build_dictionary(sentences, 2)
    expected = {'a': 1, 'man': 2}
    self.assertDictEqual(expected, actual)
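# build_dictionary isn't shown in this excerpt either, but the test pins down
# its contract: keep the vocabulary_size most frequent words and number them
# from 1 in descending frequency order, reserving index 0 for
# out-of-vocabulary words (which is why main() allocates len(dictionary) + 1
# vector rows). A minimal Counter-based sketch of that behaviour:
from collections import Counter

def build_dictionary(sentences, vocabulary_size):
    counts = Counter(word for sentence in sentences for word in sentence)
    return {word: index
            for index, (word, count)
            in enumerate(counts.most_common(vocabulary_size), start=1)}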