示例#1
0
def main(path, min_count):
    """Build a frequency-filtered vocabulary and print the token substitutions.

    The corpus at ``path`` is read twice. The first pass counts every token
    after it has been passed through ``process`` and writes the tokens seen at
    least ``min_count`` times, with their counts, to ``vocab.json`` in the
    current working directory. The second pass prints one line per token that
    has to be rewritten, formatted as ``<sent_id> <word_id> <replacement>``
    with both ids starting at 1.

    Args:
        path: Path of the whitespace-tokenized corpus, one sentence per line.
        min_count: Minimum number of occurrences for a processed token to be
            kept in the vocabulary.
    """
    # Construct vocabulary. Tokens are counted line by line so that the whole
    # corpus is never materialized in memory at once.
    counter = Counter()
    with open(path) as fin:
        for line in fin:
            counter.update(process(word) for word in line.split())
    # most_common() yields descending counts, so vocab.json lists the most
    # frequent tokens first.
    vocab = {
        word: count
        for word, count in counter.most_common()
        if count >= min_count
    }
    with open('vocab.json', 'w') as fp:
        json.dump(vocab, fp, indent=4)
    del counter

    # Get all necessary substitutions using vocabulary.
    subs = []
    with open(path) as fin:
        for sent_id, line in enumerate(fin, 1):
            for word_id, word in enumerate(line.split(), 1):
                processed = process(word)
                if processed not in vocab:
                    # Out-of-vocabulary token: substitute whatever unkify()
                    # returns for it.
                    unk = unkify(processed, vocab)
                    subs.append(f'{sent_id} {word_id} {unk}')
                elif processed != word:
                    # In-vocabulary token whose surface form was changed by
                    # process(): substitute the processed form.
                    subs.append(f'{sent_id} {word_id} {processed}')

    print('\n'.join(subs))
示例#2
0
def _process_tree(line, words, tags=False):
    tokens = line.replace(')', ' )').split()
    nonterminals = []
    new_tokens = []
    pop = False
    ind = 0
    for token in tokens:
        if token.startswith('('):  # open paren
            new_token = token[1:]
            nonterminals.append(new_token)
            new_tokens.append(token)
        elif token == ')':  # close paren
            if pop:  # preterminal
                pop = False
            else:  # nonterminal
                new_token = ')' + nonterminals.pop()
                new_tokens.append(new_token)
        else:  # word
            if not tags:
                tag = '(' + nonterminals.pop()  # pop preterminal
                new_tokens.pop()
                pop = True
            if token.lower() in words:
                new_tokens.append(token.lower())
            else:
                new_tokens.append(unkify(token))
    return ' ' + ' '.join(new_tokens[1:-1]) + ' '
示例#3
0
    def generate(self, num_out=200, prime=None, sample=True):
        """
        Generate a sequence of text from the model.

        @param num_out: The number of words to generate after the prime.
        @param prime: The priming sequence for generation, as a
                      whitespace-separated string. If None, a random entry of
                      self.vocab is used as prime.
        @param sample: Whether to sample the next word from the predicted
                       probabilities, rather than take the word of max
                       probability.

        @return: The prime followed by the num_out generated words, joined by
                 single spaces.
        @raise ValueError: If words must be generated but the prime contains
                           no words to start from.
        """
        state = self.sess.run(self.cell.zero_state(1, tf.float32))

        if prime is None:
            prime = np.random.choice(self.vocab)
        else:
            # NOTE(review): presumably maps out-of-vocabulary words to an
            # unknown-word token so the index lookup below succeeds - confirm
            # against unkify().
            prime = unkify(prime, self.vocab)

        prime_words = prime.split()
        if not prime_words and num_out > 0:
            # Generation feeds the index of the last priming word back into
            # the network; with no priming word that index would be undefined.
            raise ValueError('prime must contain at least one word')

        # Run the priming words through the network to warm up the state.
        for word in prime_words:
            print(word)  # parenthesized form works on both Python 2 and 3
            last_word_i = self.vocab.index(word)
            input_i = np.array([[last_word_i]])

            feed_dict = {self.inputs: input_i, self.initial_state: state}
            state = self.sess.run(self.final_state, feed_dict=feed_dict)

        # Collect the pieces and join once instead of growing a string.
        pieces = [prime]
        for _ in range(num_out):
            input_i = np.array([[last_word_i]])
            feed_dict = {self.inputs: input_i, self.initial_state: state}
            probs, state = self.sess.run(
                [self.probs, self.final_state], feed_dict=feed_dict)
            probs = probs[0]

            if sample:
                # An int first argument samples from np.arange(len(probs)).
                gen_word_i = np.random.choice(len(probs), p=probs)
            else:
                gen_word_i = np.argmax(probs)

            pieces.append(self.vocab[gen_word_i])
            last_word_i = gen_word_i

        return ' '.join(pieces)
示例#4
0
def ptb_recurse(t, words, forms):
  """Append the linearized form of tree ``t`` to the list ``forms``.

  The tree is written as ``(LABEL``, then its children in order, then
  ``)LABEL``. A preterminal child contributes only its first token:
  lower-cased when that lower-cased form is in ``words``, otherwise replaced
  by ``unkify(token)``. Any other child is linearized recursively.

  Args:
    t: Tree node exposing ``label`` and ``subtrees()``; children expose
      ``is_preterminal()`` and, when preterminal, ``tokens()``.
    words: Container of known lower-cased words (only ``in`` is used).
    forms: List that is extended in place; nothing is returned.
  """
  forms.append('(' + t.label)
  for node in t.subtrees():
    if not node.is_preterminal():
      ptb_recurse(node, words, forms)
      continue
    raw = node.tokens()[0]
    lowered = raw.lower()
    forms.append(lowered if lowered in words else unkify(raw))
  forms.append(')' + t.label)
示例#5
0
    def generate(self, num_out=200, prime=None, sample=True):
        """
        Generate a sequence of text from the trained model.

        @param num_out: The length of the sequence to generate, in num words.
        @param prime: The priming sequence for generation. If None, pick a random word from the
                      vocabulary as prime.
        @param sample: Whether to probabilistically sample the next word, rather than take the word
                       of max probability.

        @return: The prime followed by the num_out generated words, joined by single spaces.
        @raise ValueError: If words must be generated but the prime contains no words.
        """
        state = self.sess.run(self.cell.zero_state(1, tf.float32))

        # if no prime supplied, get a random word. Otherwise, translate all words in prime that
        # aren't in dictionary to '*UNK*'
        if prime is None:
            prime = np.random.choice(self.vocab)
        else:
            prime = unkify(prime, self.vocab)

        # the generation loop feeds the index of the last priming word back into the model, so
        # an empty prime would leave it with nothing to start from
        prime_words = prime.split()
        if not prime_words and num_out > 0:
            raise ValueError('prime must contain at least one word')

        # prime the model state
        for word in prime_words:
            print(word)  # parenthesized form works on both Python 2 and 3
            last_word_i = self.vocab.index(word)  # TODO: use dictionary?
            input_i = np.array([[last_word_i]])

            feed_dict = {self.inputs: input_i, self.initial_state: state}
            state = self.sess.run(self.final_state, feed_dict=feed_dict)

        # generate the sequence; words are collected in a list and joined once at the end
        gen_words = [prime]
        for _ in range(num_out):
            # generate word probabilities
            input_i = np.array([[last_word_i]])
            feed_dict = {self.inputs: input_i, self.initial_state: state}
            probs, state = self.sess.run([self.probs, self.final_state],
                                         feed_dict=feed_dict)
            probs = probs[0]

            # select index of new word
            if sample:
                # an int first argument samples from np.arange(len(probs))
                gen_word_i = np.random.choice(len(probs), p=probs)
            else:
                gen_word_i = np.argmax(probs)

            # append new word to the generated sequence
            gen_words.append(self.vocab[gen_word_i])
            last_word_i = gen_word_i

        return ' '.join(gen_words)
示例#6
0
    def generate(self, num_out=200, prime=None, sample=True):
        """
        Generate a sequence of text from the trained model.

        @param num_out: The length of the sequence to generate, in num words.
        @param prime: The priming sequence for generation. If None, pick a random word from the
                      vocabulary as prime.
        @param sample: Whether to probabilistically sample the next word, rather than take the word
                       of max probability.

        @return: The prime followed by the num_out generated words, joined by single spaces.
        @raise ValueError: If words must be generated but the prime contains no words.
        """
        state = self.sess.run(self.cell.zero_state(1, tf.float32))

        if prime is None:
            prime = np.random.choice(self.vocab)
        else:
            # NOTE(review): presumably replaces out-of-vocabulary words so that the index lookup
            # below cannot fail - confirm against unkify().
            prime = unkify(prime, self.vocab)

        tokens = prime.split()
        if num_out > 0 and not tokens:
            # Each generation step is fed the index of the previous word; without a priming word
            # there is no previous word for the first step.
            raise ValueError('prime must contain at least one word')

        # Feed the priming words one at a time, carrying the recurrent state forward.
        for word in tokens:
            print(word)  # parenthesized form works on both Python 2 and 3
            last_word_i = self.vocab.index(word)
            input_i = np.array([[last_word_i]])

            feed_dict = {self.inputs: input_i, self.initial_state: state}
            state = self.sess.run(self.final_state, feed_dict=feed_dict)

        # Accumulate output words in a list; joining once avoids repeated string copies.
        sequence = [prime]
        for _ in range(num_out):
            input_i = np.array([[last_word_i]])
            feed_dict = {self.inputs: input_i, self.initial_state: state}
            probs, state = self.sess.run([self.probs, self.final_state],
                                         feed_dict=feed_dict)
            probs = probs[0]

            if sample:
                # An int first argument samples from np.arange(len(probs)).
                gen_word_i = np.random.choice(len(probs), p=probs)
            else:
                gen_word_i = np.argmax(probs)

            sequence.append(self.vocab[gen_word_i])
            last_word_i = gen_word_i

        return ' '.join(sequence)