import os
import sys

# config and vocabulary are project-level modules assumed to be imported
# alongside this file.


def build_vocabulary():
    if not os.path.exists(config.TRAIN_FILE):
        print("Error: can't find train file '%s'!" % config.TRAIN_FILE)
        sys.exit(1)

    # Reserve the special tokens up front. The second argument is presumably
    # an initial count high enough that these tokens survive rare-word pruning.
    if config.INCLUDE_UNKNOWN_WORD:
        vocabulary.add(config.UNKNOWN_WORD, config.WORD_COUNT + 1)
    vocabulary.add(config.SYMBOL_WORD)
    vocabulary.add(config.PADDING_WORD, config.WORD_COUNT + 1)

    train_size = 0
    with open(config.TRAIN_FILE, 'r') as f:
        for line in f:
            words = line.strip('\n').split()
            for word in words:
                word = getNormalWord(word)
                if word:
                    train_size += 1
                    vocabulary.add(word)

    # Prune rare words, then persist the vocabulary.
    vocabulary.delete_word()
    vocabulary.save_vocabulary()
    print("TRAIN SIZE: %d" % train_size)
    print("VOCABULARY SIZE: %d" % vocabulary.length())
    vocabulary.dump_vocabulary()
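# getNormalWord is called above but not defined in this section. The sketch
# below is an assumption, not the project's helper: it lowercases tokens, maps
# pure punctuation to the shared SYMBOL_WORD, and collapses digits. The real
# function may also drop tokens entirely (callers check `if word:`).
import re


def getNormalWord(word):
    word = word.lower()
    if re.fullmatch(r'\W+', word):
        return config.SYMBOL_WORD     # pure punctuation -> symbol token
    return re.sub(r'\d', '0', word)   # collapse digits to one canonical form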
def __init__(self, window_size=config.WINDOW_SIZE,
             embedding_size=config.EMBEDDING_SIZE,
             hidden_size=config.HIDDEN_SIZE):
    """
    Initialize L{Model} parameters.
    """
    self.vocab_size = vocabulary.length()
    self.window_size = window_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = 1
    # The input layer sees the concatenated embeddings of one window.
    self.input_size = self.embedding_size * self.window_size

    numpy.random.seed()
    # Embeddings are drawn uniformly from [-1, 1).
    self.embeddings = (
        numpy.random.rand(self.vocab_size, self.embedding_size) - 0.5) * 2
    if config.NORMALIZE_EMBEDDINGS:
        self.normalize(range(self.vocab_size))

    self.hidden_weights = random_weights(
        self.input_size, self.hidden_size,
        scale_by=config.SCALE_INITIAL_WEIGHTS_BY)
    self.output_weights = random_weights(
        self.hidden_size, self.output_size,
        scale_by=config.SCALE_INITIAL_WEIGHTS_BY)
    self.hidden_biases = numpy.zeros((1, self.hidden_size))
    self.output_biases = numpy.zeros((1, self.output_size))
def __init__(self, window_size=config.WINDOW_SIZE,
             embedding_size=config.EMBEDDING_SIZE,
             hidden_size=config.HIDDEN_SIZE):
    """
    Initialize L{Model} parameters (Theano variant: weights and biases are
    held in theano.shared variables; floatX is presumably
    theano.config.floatX).
    """
    self.vocab_size = vocabulary.length()
    self.window_size = window_size
    self.embedding_size = embedding_size
    self.hidden_size = hidden_size
    self.output_size = 1
    self.input_size = self.embedding_size * self.window_size

    numpy.random.seed()
    # Embeddings are drawn uniformly from [-1, 1) and cast to floatX.
    self.embeddings = numpy.asarray(
        (numpy.random.rand(self.vocab_size, self.embedding_size) - 0.5) * 2,
        dtype=floatX)
    if config.NORMALIZE_EMBEDDINGS:
        self.normalize(range(self.vocab_size))

    # Cast the weights to floatX so all shared parameters have one dtype.
    self.hidden_weights = theano.shared(numpy.asarray(
        random_weights(self.input_size, self.hidden_size), dtype=floatX))
    self.output_weights = theano.shared(numpy.asarray(
        random_weights(self.hidden_size, self.output_size), dtype=floatX))
    self.hidden_biases = theano.shared(
        numpy.asarray(numpy.zeros((self.hidden_size,)), dtype=floatX))
    self.output_biases = theano.shared(
        numpy.asarray(numpy.zeros((self.output_size,)), dtype=floatX))
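# Why theano.shared: shared variables keep parameter state inside the Theano
# runtime so a compiled function can read and update them in place across
# calls. A minimal self-contained sketch (illustration only, not this
# project's training code):
import numpy
import theano
import theano.tensor as T

W = theano.shared(
    numpy.random.randn(3, 2).astype(theano.config.floatX), name='W')
x = T.matrix('x')
loss = T.sum(T.dot(x, W) ** 2)
step = theano.function(
    [x], loss,
    updates=[(W, W - 0.1 * T.grad(loss, W))])  # gradient step updates W in place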
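# Neither random_weights nor Model.normalize is defined in this section. The
# sketches below are assumptions about their behavior, not the project's
# actual helpers.

def random_weights(rows, cols, scale_by=1.0):
    # Presumably a uniform draw from [-scale_by, scale_by).
    return (numpy.random.rand(rows, cols) - 0.5) * 2 * scale_by


def normalize(self, indices):
    # Presumably a Model method that rescales the selected embedding rows to
    # unit L2 norm, matching the config.NORMALIZE_EMBEDDINGS branch above.
    for i in indices:
        norm = numpy.sqrt(numpy.sum(self.embeddings[i] ** 2))
        if norm > 0:
            self.embeddings[i] /= norm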
def build_samples():
    if not vocabulary.length():
        vocabulary.load_vocabulary()

    # half_window must be an integer for slicing; use floor division
    # (plain / yields a float under Python 3).
    half_window = config.WINDOW_SIZE // 2
    padding_id = vocabulary.id(config.PADDING_WORD)
    unknown_id = vocabulary.id(config.UNKNOWN_WORD)
    symbol_id = vocabulary.id(config.SYMBOL_WORD)

    def too_noisy(window):
        # Reject windows where more than half the tokens are unknown,
        # symbol, or padding ids.
        return (window.count(unknown_id) + window.count(symbol_id)
                + window.count(padding_id)) > half_window

    with open(config.SAMPLE_FILE, 'w') as sample_file, \
            open(config.TRAIN_FILE, 'r') as train_file:
        for line in train_file:
            # Keep only tokens that survive normalization, as in
            # build_vocabulary.
            words = [w for w in (getNormalWord(tok)
                                 for tok in line.strip('\n').split()) if w]
            word_ids = [vocabulary.id(word) for word in words]
            sent_length = len(word_ids)
            for index, word_id in enumerate(word_ids):
                # Never center a window on an unknown word.
                if word_id == unknown_id:
                    continue
                # Fast path: the window fits entirely inside the sentence.
                if index - half_window >= 0 and index + half_window < sent_length:
                    window = word_ids[index - half_window:index + half_window + 1]
                    if not too_noisy(window):
                        sample_file.write(
                            ' '.join(str(wid) for wid in window) + '\n')
                    continue
                # Otherwise pad on the left and/or right as needed.
                window = []
                if index - half_window < 0:
                    window.extend([padding_id] * (half_window - index))
                    window.extend(word_ids[:index + 1])
                else:
                    window.extend(word_ids[index - half_window:index + 1])
                if index + half_window >= sent_length:
                    window.extend(word_ids[index + 1:])
                    window.extend(
                        [padding_id] * (index + half_window - sent_length + 1))
                else:
                    window.extend(word_ids[index + 1:index + half_window + 1])
                if not too_noisy(window):
                    sample_file.write(
                        ' '.join(str(wid) for wid in window) + '\n')
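# A minimal driver sketch (an assumption; the original section shows no entry
# point). It wires the pieces above together: build the vocabulary,
# materialize the window samples, then construct the model. `Model` is the
# assumed name of the class owning the __init__ above.
if __name__ == '__main__':
    build_vocabulary()
    build_samples()
    model = Model()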