import json
from collections import Counter


def main(path, min_count):
    # Construct vocabulary.
    vocab = []
    with open(path) as fin:
        for line in fin:
            words = line.strip().split()
            for word in words:
                word = process(word)
                vocab.append(word)
    counter = Counter(vocab)
    vocab = dict((word, count) for word, count in counter.most_common()
                 if count >= min_count)
    with open('vocab.json', 'w') as fp:
        json.dump(vocab, fp, indent=4)
    del counter

    # Get all necessary substitutions using the vocabulary.
    subs = []
    with open(path) as fin:
        for sent_id, line in enumerate(fin, 1):
            words = line.strip().split()
            for word_id, word in enumerate(words, 1):
                processed = process(word)
                if processed not in vocab:
                    unk = unkify(processed, vocab)
                    subs.append(f'{sent_id} {word_id} {unk}')
                elif processed != word:
                    subs.append(f'{sent_id} {word_id} {processed}')
    print('\n'.join(subs))
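# `main` above relies on two helpers that are not shown: `process`, which
# normalizes a raw token, and `unkify(word, vocab)`, which maps an
# out-of-vocabulary token to an UNK symbol. A minimal sketch under those
# assumptions (hypothetical, not the original implementations):

def process(word):
    # Hypothetical normalizer: lowercase and strip surrounding punctuation.
    return word.lower().strip('.,;:!?"')


def unkify(word, vocab):
    # Hypothetical unker: pick a coarse UNK class from surface features,
    # in the spirit of Berkeley-parser-style unknown-word signatures.
    if word.isdigit():
        return '<unk-num>'
    if any(ch.isdigit() for ch in word):
        return '<unk-alnum>'
    return '<unk>'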
def _process_tree(line, words, tags=False):
    tokens = line.replace(')', ' )').split()
    nonterminals = []
    new_tokens = []
    pop = False
    for token in tokens:
        if token.startswith('('):  # open paren
            new_token = token[1:]
            nonterminals.append(new_token)
            new_tokens.append(token)
        elif token == ')':  # close paren
            if pop:  # preterminal
                pop = False
            else:  # nonterminal
                new_token = ')' + nonterminals.pop()
                new_tokens.append(new_token)
        else:  # word
            if not tags:
                nonterminals.pop()  # discard the preterminal tag
                new_tokens.pop()
                pop = True
            if token.lower() in words:
                new_tokens.append(token.lower())
            else:
                new_tokens.append(unkify(token))
    return ' ' + ' '.join(new_tokens[1:-1]) + ' '
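# Illustration of what `_process_tree` produces with tags=False: preterminal
# POS tags are dropped, closing brackets are labeled with their nonterminal,
# and the root bracket pair is stripped by the final [1:-1] slice. Note that
# this snippet calls `unkify` with a single argument, so it assumes a
# different `unkify` signature than the vocabulary-based one above.
line = '(S (NP (DT the) (NN dog)) (VP (VBD ran)))'
print(_process_tree(line, words={'the', 'dog', 'ran'}))
# -> ' (NP the dog )NP (VP ran )VP '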
def ptb_recurse(t, words, forms):
    # Linearize a PTB-style tree into bracketed forms: '(LABEL' on entry,
    # lowercased (or unked) terminals in between, and ')LABEL' on exit.
    forms.append('(' + t.label)
    for child in t.subtrees():
        if child.is_preterminal():
            token = child.tokens()[0]
            if token.lower() not in words:
                forms.append(unkify(token))
            else:
                forms.append(token.lower())
        else:
            ptb_recurse(child, words, forms)
    forms.append(')' + t.label)
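# `ptb_recurse` assumes a tree object exposing a `label` attribute and
# `subtrees()`, `is_preterminal()`, and `tokens()` methods. A minimal
# stand-in satisfying that interface (hypothetical, for illustration only):

class Tree(object):
    def __init__(self, label, children=None, token=None):
        self.label = label              # nonterminal or POS label
        self.children = children or []
        self.token = token              # surface word, set on preterminals

    def subtrees(self):
        return self.children

    def is_preterminal(self):
        return self.token is not None

    def tokens(self):
        return [self.token]


# Example: (S (NP (DT The) (NN dog)) (VP (VBD ran)))
tree = Tree('S', [
    Tree('NP', [Tree('DT', token='The'), Tree('NN', token='dog')]),
    Tree('VP', [Tree('VBD', token='ran')]),
])
forms = []
ptb_recurse(tree, words={'the', 'dog', 'ran'}, forms=forms)
# forms == ['(S', '(NP', 'the', 'dog', ')NP', '(VP', 'ran', ')VP', ')S']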
def generate(self, num_out=200, prime=None, sample=True):
    """
    Generate a sequence of text from the trained model.

    @param num_out: The length of the sequence to generate, in number of words.
    @param prime:   The priming sequence for generation. If None, pick a random
                    word from the vocabulary as the prime.
    @param sample:  Whether to probabilistically sample the next word, rather
                    than take the word of maximum probability.
    """
    state = self.sess.run(self.cell.zero_state(1, tf.float32))

    # If no prime is supplied, pick a random vocabulary word. Otherwise,
    # translate all words in the prime that aren't in the vocabulary to UNK.
    if prime is None:
        prime = np.random.choice(self.vocab)
    else:
        prime = unkify(prime, self.vocab)

    # Prime the model state.
    for word in prime.split():
        print(word)
        last_word_i = self.vocab.index(word)
        input_i = np.array([[last_word_i]])
        feed_dict = {self.inputs: input_i, self.initial_state: state}
        state = self.sess.run(self.final_state, feed_dict=feed_dict)

    # Generate the sequence.
    gen_seq = prime
    for i in range(num_out):
        # Compute next-word probabilities from the current state.
        input_i = np.array([[last_word_i]])  # TODO: use dictionary?
        feed_dict = {self.inputs: input_i, self.initial_state: state}
        probs, state = self.sess.run([self.probs, self.final_state],
                                     feed_dict=feed_dict)
        probs = probs[0]

        # Select the index of the next word.
        if sample:
            gen_word_i = np.random.choice(np.arange(len(probs)), p=probs)
        else:
            gen_word_i = np.argmax(probs)

        # Append the new word to the generated sequence.
        gen_word = self.vocab[gen_word_i]
        gen_seq += ' ' + gen_word
        last_word_i = gen_word_i
    return gen_seq
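# A hedged usage sketch: the method above assumes a TensorFlow 1.x-style
# model object exposing `sess`, `cell`, `inputs`, `initial_state`,
# `final_state`, `probs`, and a `vocab` list. The names below are
# illustrative, not from the original source.
#
#   model = WordRNN(vocab, ...)           # hypothetical constructor
#   text = model.generate(num_out=50, prime='the', sample=True)
#   print(text)
#
# With sample=True, np.random.choice(..., p=probs) draws from the softmax
# distribution (probs must sum to 1); with sample=False, argmax decoding is
# deterministic and tends to loop on high-frequency words.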