Example no. 1
class Corpus(object):

    def __init__(self):

        self.documents = []
        self.vocab = Vocabulary()
        self.frozen = False

    def add(self, name, tokens):

        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):

        for doc in self.documents:
            doc.freeze()

        self.vocab.stop_growth()
        self.frozen = True

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        # pickle data is binary; open in 'rb' mode
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def save(self, filename):
        with open(filename, 'wb') as f:
            pickle.dump(self, f)
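
A minimal usage sketch for the class above, assuming the Vocabulary and Document classes it depends on come from the same codebase; corpus.pkl is a hypothetical file name:

corpus = Corpus()
corpus.add('doc1', ['the', 'quick', 'brown', 'fox'])
corpus.add('doc2', ['the', 'lazy', 'dog'])
corpus.freeze()                      # freeze documents and stop vocabulary growth
corpus.save('corpus.pkl')            # pickle the whole corpus to disk
restored = Corpus.load('corpus.pkl')
print(len(restored))                 # 2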
Example no. 2
 def cleanUpText(self, text):
     cleanedWords = []
     # lowercase and split on whitespace
     words = text.lower().split()
     # get vocabulary
     vocab = Vocabulary()
     for word in words:
         # check Portuguese stopwords
         # TODO: Implement other languages tokenizers
         if word not in vocab.getPTStopWords():
             cleanedWords.append(word)
     return cleanedWords
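
A hedged usage sketch of the method above, assuming it lives on a preprocessing class (here called TextCleaner, a hypothetical name) and that Vocabulary.getPTStopWords() returns an iterable of Portuguese stopwords:

cleaner = TextCleaner()
tokens = cleaner.cleanUpText('O gato subiu no telhado')
print(tokens)   # Portuguese stopwords such as 'o' and 'no' are filtered out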
Example no. 3
    def __extract_vocabularies_from_data(self, classes):
        vocabularies = set()
        for c in classes:
            strings = self.__access_strings(c, '/train')
            vocabulary = Vocabulary(strings)

            curr_vocabulary = vocabulary.get_vocabulary()

            self.__write_vocabulary(c, curr_vocabulary)

            vocabularies |= curr_vocabulary  # union with the accumulated vocabulary set

        return sorted(vocabularies)
Example no. 4
class Corpus(object):
    def __init__(self, documents=None, vocab=None, frozen=None):

        if documents:
            self.documents = documents
        else:
            self.documents = []

        if vocab:
            self.vocab = vocab
        else:
            self.vocab = Vocabulary()

        if frozen:
            self.frozen = frozen
        else:
            self.frozen = False

    def add(self, name, tokens):

        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):

        for doc in self.documents:
            doc.freeze()

        self.vocab.stop_growth()
        self.frozen = True

    def __getitem__(self, i):
        return self.documents[i]

    def __getslice__(self, i, j):
        # __getslice__ is Python 2 only; under Python 3, slicing goes through __getitem__
        return Corpus(self.documents[i:j], self.vocab, self.frozen)

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        with open(filename, "rb") as f:
            return pickle.load(f)

    def save(self, filename):
        with open(filename, "wb") as f:
            pickle.dump(self, f)
Example no. 5
class VocabularyTest(unittest.TestCase):

    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('testdata/vocabulary.dat', 'testdata/custom_words')

        pprint.pprint(self.vocabulary.trie)
        pprint.pprint(self.vocabulary.words)

    def test_vocabulary(self):
        self.assertIn(u'英雄三国', self.vocabulary.words.keys())
        self.assertIn(u'魔鬼代言人', self.vocabulary.words.keys())
        self.assertIn(u'黄河水利委员会', self.vocabulary.words.keys())
        self.assertNotIn(u'十大伪歌手', self.vocabulary.words.keys())
        self.assertNotIn(u'走路太牛', self.vocabulary.words.keys())

        self.assertEqual('n', self.vocabulary.get_pos(u'英雄三国'))
        self.assertEqual('n', self.vocabulary.get_pos(u'魔鬼代言人'))
        self.assertEqual('nt', self.vocabulary.get_pos(u'黄河水利委员会'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'十大伪歌手'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'走路太牛'))

    def test_gen_DAG(self):
        pprint.pprint(self.vocabulary.gen_DAG(
            u'《英雄三国》是由网易历时四年自主研发运营的一款英雄对战竞技网游。'))
Example no. 6
 def setUp(self):
     self.vocabulary = Vocabulary()
     self.vocabulary.load('../data/vocabulary.dat')
     self.hmm_segmenter = HMMSegmenter()
     self.hmm_segmenter.load('../data/hmm_segment_model')
     self.max_prob_segmenter = MaxProbSegmenter(
             self.vocabulary, self.hmm_segmenter)
Example no. 7
 def test_pronunciation_valid_phrase(self):
     current_result = vb.pronunciation("hippopotamus")
     result = '[{"rawType": "ahd-legacy", "raw": "(hĭpˌə-pŏtˈə-məs)", "seq": 0}, {"rawType": "arpabet", "raw": "HH IH2 P AH0 P AA1 T AH0 M AH0 S", "seq": 0}]'
     expected_result = json.loads(result)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 8
 def test_antonym_valid_phrase_2(self):
     current_result = vb.antonym("respect")
     result = '{"text": ["disesteem", "disrespect"]}'
     expected_result = json.loads(result)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 9
 def test_translate_valid_phrase(self):
     current_result = vb.translate("hummus", "en", "es")
     result = '[{"text": "hummus", "seq": 0}]'
     middle_val = json.loads(result)
     expected_result = json.dumps(middle_val)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 10
 def test_partOfSpeech_valid_phrase_2(self):
     current_result = vb.part_of_speech("rapidly")
     result = '[{"text": "adverb", "example:": "With speed; in a rapid manner.", "seq": 0}]'
     middle_val = json.loads(result)
     expected_result = json.dumps(middle_val)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 11
 def test_partOfSpeech_valid_phrase_1(self):
     current_result = vb.part_of_speech("hello")
     result = '[{"text": "interjection", "example:": "Used to greet someone, answer the telephone, or express surprise.", "seq": 0}]'
     middle_val = json.loads(result)
     expected_result = json.dumps(middle_val)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 12
 def test_hyphenation_valid_phrase(self):
     current_result = vb.hyphenation("hippopotamus")
     result = '[{"seq": 0, "text": "hip", "type": "secondary stress"}, {"seq": 1, "text": "po"}, {"seq": 2, "text": "pot", "type": "stress"}, {"seq": 3, "text": "a"}, {"seq": 4, "text": "mus"}]'
     middle_val = json.loads(result)
     expected_result = json.dumps(middle_val)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 13
 def test_usageExamples_valid_phrase(self):
     current_result = vb.usage_example("hillock")
     result = '[{"seq": 0, "text": "I went to the to of the hillock to look around."}]'
     middle_val = json.loads(result)
     expected_result = json.dumps(middle_val)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 14
 def test_synonym_valid_phrase(self):
     current_result = vb.synonym("repudiate")
     result = '[{"seq": 0, "text": "deny"}]'
     middle_val = json.loads(result)
     expected_result = json.dumps(middle_val)
     if sys.version_info[:2] <= (2, 7):
         self.assertItemsEqual(current_result, expected_result)
     else:
         self.assertCountEqual(current_result, expected_result)
Example no. 15
def synonyms(word):
    try:
        synonyms = ''
        result = json.loads(vb.synonym(word))
        for res in result:
            synonyms += res['text'] + ','
        return synonyms[:-1] + '\n'
    except Exception:
        return "N/A"
Example no. 16
def meaning(word):
    try:
        parts = ''
        result = json.loads(vb.part_of_speech(word))
        for res in result:
            parts += res['text'] + ':' + res[u'example:'] + '\n\n'
        return parts
    except Exception:
        return "N/A"
Example no. 17
def translate(text):
    try:
        translation = ''
        result = json.loads(vb.translate(text, "en", "hi"))
        for res in result:
            translation += res['text'] + ','
        return translation[:-1] + '\n'
    except Exception:
        return "N/A"
Example no. 18
def main():
    """."""
    from vocabulary import Vocabulary
    from attribute import Attribute
    from attribute_structure import AttributeStructure
    from attribute_system import AttributeSystem

    vocabulary = Vocabulary(['C'], [], ['V'])

    a = Attribute("a", [])
    b = Attribute("b", [])
    astr = AttributeStructure(a, b)
    objs = ['a', 'b', 'c']
    attribute_system = AttributeSystem(astr, objs)

    C = ConstantAssignment(vocabulary, attribute_system, {'C': 'a'})
    print(C._vocabulary)
    vocabulary.add_constant("C2")
    print(C._vocabulary)
Example no. 19
def get_example(word):
    try:
        examples = json.loads(vb.usage_example(word))
        example = ''
        limit = min(3, len(examples))
        for i in range(limit):
            example += examples[i]['text']+'...'    
        return example
    except Exception as e:
        print(e, '\nFlag example')
        return ""
Example no. 20
    def setUp(self):
        self.document = Document(20)
        self.vocabulary = Vocabulary()
        self.vocabulary.load("../testdata/vocabulary.dat")

        self.model = Model(20)
        self.model.load('../testdata/lda_model')

        self.doc_tokens = ['macbook', 'ipad',  # exist in vocabulary and model
                'mac os x', 'chrome',  # only exist in vocabulary
                'nokia', 'null']  # inexistent
Example no. 21
class MaxProbSegmenterTest(unittest.TestCase):

    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../data/vocabulary.dat')
        self.hmm_segmenter = HMMSegmenter()
        self.hmm_segmenter.load('../data/hmm_segment_model')
        self.max_prob_segmenter = MaxProbSegmenter(
                self.vocabulary, self.hmm_segmenter)

    def call_segment(self, text):
        for word in self.max_prob_segmenter.segment(text):
            print word + '/\t',
        print ''

    def test_segment(self):
        fp = open('testdata/document.dat', 'rb')
        for text in fp.readlines():
            self.call_segment(text.strip())
        fp.close()
Example no. 22
def usage_example(word):
    try:
        examples = ''
        result = json.loads(vb.usage_example(word))
        for res in result:
            examples += res['text'] + '\n\n'
        if len(examples) < 300:
            return examples
        else:
            return examples[:300]
    except Exception:
        return "N/A"
Example no. 23
def get_meaning(word):
    try:
        meaning = json.loads(vb.meaning(word))
        means = ''
        limit = min(3, len(meaning))
        for i in range(limit):
            means += meaning[i]['text'] + ';'

        return means
    except Exception as e:
        print(e)
        return ""
Example no. 24
    def __init__(self, args, src_file, trg_file):

        self.src_vocabulary = Vocabulary()
        self.src_vocabulary.make_dictionary(src_file)
        self.trg_vocabulary = Vocabulary()
        self.trg_vocabulary.make_dictionary(trg_file)

        self.src_size = len(self.src_vocabulary.wtoi)
        self.embed_size = args.embed_size
        self.hidden_size = args.hidden_size
        self.trg_size = len(self.trg_vocabulary.wtoi)

        super(EncoderDecoder, self).__init__(
            # encoder
            w_xe=F.EmbedID(self.src_size, self.embed_size),
            w_ep=F.Linear(self.embed_size, self.hidden_size*4),
            w_pp=F.Linear(self.hidden_size, self.hidden_size*4),
            # decoder
            w_ey=F.EmbedID(self.trg_size, self.embed_size),
            w_qe=F.Linear(self.embed_size, self.hidden_size*4),
            w_qq=F.Linear(self.hidden_size, self.hidden_size*4),
            w_yq=F.Linear(self.hidden_size, self.trg_size),
        )
Example no. 25
    def get_context(text):
        """
        Try to get context for card
        :param card:
        """

        try:
            m = json.loads(vb.usage_example(text))
            if len(m) > 0:
                return m[0]['text']
            return u''
        except Exception as ex:
            error(u'', ex)
            return u''
Example no. 26
 def get_meaning(text, lang):
     """
     Try to get meaning for card
     :param text:
     :param lang:
     :return:
     """
     try:
         m = json.loads(vb.meaning(text, lang, lang))
         if len(m) > 0:
             return m[0]['text']
         return u''
     except Exception as ex:
         error(u'', ex)
         return u''
Example no. 27
def generate_dataset(items, slots, voca: Vocabulary):
    dataset = Dataset()
    for item in items:
        vectors = []
        for word in item[0].split():
            vectors.append(voca.get(word))

        labels = []
        for tag in item[1].split():
            value = np.zeros([len(slots)], dtype=np.float32)
            value[slots.index(tag)] = 1
            labels.append(value)

        dataset.add(item[0], item[1], vectors, labels)

    return dataset
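
A hedged usage sketch of generate_dataset, assuming Vocabulary.get(word) returns a word vector and Dataset.add accepts the arguments used above; the slot names and the single training item are hypothetical:

voca = Vocabulary()   # assumed to be pre-loaded with word vectors
slots = ['O', 'B-city', 'B-date']
items = [('book a flight to paris tomorrow',
          'O O O O B-city B-date')]
dataset = generate_dataset(items, slots, voca)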
Example no. 28
    def __init__(self, documents=None, vocab=None, frozen=None):

        if documents:
            self.documents = documents
        else:
            self.documents = []

        if vocab:
            self.vocab = vocab
        else:
            self.vocab = Vocabulary()

        if frozen:
            self.frozen = frozen
        else:
            self.frozen = False
Example no. 29
    def open(self, corpus_dir):
        self.root_dir = corpus_dir
        if not path.isdir(corpus_dir):
            os.mkdir(corpus_dir)

        self.meta_dir = self.root_dir + "/meta"

        self.samples_dir = self.root_dir + "/samples"
        if not path.isdir(self.samples_dir):
            os.mkdir(self.samples_dir)

        self.vocabulary_dir = self.root_dir + "/vocabulary"
        self.vocabulary = Vocabulary(self.vocabulary_dir)

        self.categories_dir = self.root_dir + "/categories"
        self.categories = Categories(self.categories_dir)
        self.categories.load_categories()
        self.categories.print_categories()
Example no. 30
    def test_meaning_valid_phrase(self):
        current_result = vb.meaning("humming")
        result = '[{"seq": 0, "text": "Present participle of hum."}]'
        middle_val = json.loads(result)
        expected_result = json.dumps(middle_val)
        if sys.version_info[:2] <= (2, 7):  ## python 2 
            self.assertItemsEqual(current_result, expected_result)
        else:       # python 3
            """
            assertItemsEqual() was renamed to assertCountEqual() 
            Why am I not using assertEqual() here?

            Reference: 
            - http://stackoverflow.com/a/7473137/3834059
            - https://docs.python.org/2/library/unittest.html#unittest.TestCase.assertItemsEqual
            - https://docs.python.org/3/library/unittest.html?highlight=assertcountequal#unittest.TestCase.assertCountEqual
            """

            self.assertCountEqual(current_result, expected_result)
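
As a side note, the repeated version check in these tests could be factored into a single helper; a minimal sketch, assuming the same unittest setting (the method name items_equal is hypothetical):

import sys
import unittest

class CompatTestCase(unittest.TestCase):
    def items_equal(self, first, second):
        # assertItemsEqual (Python 2) was renamed to assertCountEqual (Python 3.2+)
        if sys.version_info[:2] <= (2, 7):
            self.assertItemsEqual(first, second)
        else:
            self.assertCountEqual(first, second)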
Example no. 31
def main():
    os.makedirs(os.path.join(args.logdir, 'models'))

    vocab = Vocabulary(os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
    print(f"# entity in dataset: {len(vocab)}")

    if not os.path.exists(args.cache):
        STOPWORD_PATH = os.path.join(args.dataroot, "previous/stopwords.txt")
        SYMBOL_PATH = os.path.join(args.dataroot, "previous/symbols.txt")
        with open(STOPWORD_PATH, 'r') as f:
            stop_words = set([line.strip() for line in f])
        with open(SYMBOL_PATH, 'r') as f:
            symbols = set([line.strip() for line in f])
        stop_words = stop_words.union(symbols)

        # Pre-trained word embedding
        wiki2vec = Wikipedia2Vec.load(args.wiki2vec)

        context_entity_word_co_occur_path = os.path.join(
            args.wiki_preprocess, 'context_entity_word_co_occur.txt')
        context_positive_words = filter_positive_words(
            context_entity_word_co_occur_path,
            stop_words,
            wiki2vec,
            vocab,
        )

        page_entity_word_co_occur_path = os.path.join(
            args.wiki_preprocess, 'page_entity_word_co_occur.txt')
        page_positive_words = filter_positive_words(
            page_entity_word_co_occur_path,
            stop_words,
            wiki2vec,
            vocab,
        )

        word_count_path = os.path.join(args.wiki_preprocess, 'word_count.json')
        negative_words, negative_freqs = \
            filter_negative_words(
                word_count_path,
                stop_words,
                wiki2vec,
                freq_power=0.6
            )
        (page_positive_words, context_positive_words, negative_words,
         vecs) = get_reduced_embedding(page_positive_words,
                                       context_positive_words, negative_words,
                                       wiki2vec)
        del wiki2vec

        os.makedirs(os.path.dirname(args.cache), exist_ok=True)
        pickle.dump((page_positive_words, context_positive_words,
                     negative_words, negative_freqs, vecs),
                    open(args.cache, 'wb'))
    else:
        print(f"Load cache {args.cache}")
        (page_positive_words, context_positive_words, negative_words,
         negative_freqs, vecs) = pickle.load(open(args.cache, 'rb'))

    page_non_empty_index = [
        i for i, positive_words in enumerate(page_positive_words)
        if len(positive_words) != 0
    ]
    context_non_empty_index = [
        i for i, positive_words in enumerate(context_positive_words)
        if len(positive_words) != 0
    ]
    non_empty_index = set(page_non_empty_index + context_non_empty_index)
    print(f'# entity in vocab  : {len(vocab) - 1:d}')
    print(f'# non empty page   : {len(page_non_empty_index):d}')
    print(f'# non empty context: {len(context_non_empty_index):d}')
    print(f'# non empty        : {len(non_empty_index):d}')

    word_embedding = nn.Embedding.from_pretrained(torch.tensor(vecs))
    word_embedding = word_embedding.to(device)
    entity_embedding = nn.Embedding(len(vocab) - 1, vecs.shape[1])
    nn.init.normal_(entity_embedding.weight, mean=0, std=1.)
    with torch.no_grad():
        for idx in range(len(vocab) - 1):
            if idx not in non_empty_index:
                entity_embedding.weight[idx] = 0.
    entity_embedding = entity_embedding.to(device)
    optimizer = torch.optim.Adagrad(entity_embedding.parameters(), lr=args.lr)

    dataset = ContrastiveDataset(page_positive_words, negative_freqs,
                                 negative_words, args.positive_num,
                                 args.negative_num)
    dataset = Subset(dataset, page_non_empty_index)
    writer = SummaryWriter(os.path.join(args.logdir, 'phase1'))
    print('Phase 1')
    train(word_embedding,
          entity_embedding,
          optimizer,
          dataset,
          writer,
          start_epochs=1,
          end_epochs=args.phase1_epochs)

    dataset = ContrastiveDataset(context_positive_words, negative_freqs,
                                 negative_words, args.positive_num,
                                 args.negative_num)
    dataset = Subset(dataset, context_non_empty_index)
    writer = SummaryWriter(os.path.join(args.logdir, 'phase2'))
    print('Phase 2')
    train(word_embedding,
          entity_embedding,
          optimizer,
          dataset,
          writer,
          start_epochs=args.phase1_epochs + 1,
          end_epochs=args.phase2_epochs)
Example no. 32
class Main:
    def main(self):
        clearCli()
        self.vocabulary = Vocabulary()
        vocabulary = self.vocabulary
        vocabulary.buildVocabulary()

        isValidCommand = True
        while True:
            quiz = Quiz(vocabulary)
            clearCli()
            print(CLI.main_menu)
            if not isValidCommand:
                print(CLI.invalid_command)
            command = input()
            isValidCommand = command in ['sa', 's', 'sl', 'q', 'j1', 'j2', 'j3', 'la', 't', 'o']
            if isValidCommand:
                if command == 'sa':
                    print('Starting quiz!\n\n')
                    language = self.selectLanguage()
                    quiz.startall(language)
                elif command == 's':
                    numQuestions = self.selectNumQuestions()
                    language = self.selectLanguage()
                    print('Starting quiz!\n\n')
                    quiz.start(language, numQuestions)
                elif command == 'sl':
                    startLesson, endLesson = self.selectLessons()
                    language = self.selectLanguage()
                    print('Starting quiz!\n\n')
                    quiz.start(language, startLesson=startLesson,
                               endLesson=endLesson)
                elif command == 'j1':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 1 vocabulary!\n\n')
                    quiz.start(language, startLesson=1, endLesson=10)
                elif command == 'j2':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 2 vocabulary!\n\n')
                    quiz.start(language, startLesson=11, endLesson=20)
                elif command == 'j3':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 3 vocabulary!\n\n')
                    quiz.start(language, startLesson=21, endLesson=32)
                elif command == 'o':
                    print('Starting open ended quiz')
                    startLesson, endLesson = self.selectLessons()
                    quiz.start_open_ended(startLesson=startLesson, endLesson=endLesson)
                elif command == 'q':
                    print('Quitting program')
                    break
                elif command == 'la':
                    print('Listing all vocabulary')
                    vocabulary.printWholeVocabulary()
                elif command == 't':
                    print('Test')
                    self.testFunction()

    def testFunction(self):
        kksi = kakasi()
        kksi.setMode("J", "H")
        conv = kksi.getConverter()
        all_hiragana = 'がくせい'
        partial_hiragana1 = '学せい'
        partial_hiragana2 = 'がく生'
        all_kanji = '学生'
        print(conv.do(all_hiragana))
        print(conv.do(partial_hiragana1))
        print(conv.do(partial_hiragana2))
        print(conv.do(all_kanji))  
        print(conv.do(all_hiragana) == conv.do(partial_hiragana1) == conv.do(partial_hiragana2) == conv.do(all_kanji))
        input()
    
    def selectNumQuestions(self):
        clearCli()
        print('How many questions?')
        while True:
            value = input()
            isANumber = value.isnumeric()
            isNumberWithinVocabSize = isANumber and 1 <= int(value) <= self.vocabulary.getVocabularySize()
            if isNumberWithinVocabSize:
                break
            clearCli()
            print('How many questions?')
            if not isANumber: 
                print('Not a number')
            elif isANumber and not isNumberWithinVocabSize:
                print('Invalid value. Vocabulary size is', self.vocabulary.getVocabularySize())
        numQuestions = int(value)
        return numQuestions

    def selectLanguage(self):
        clearCli()
        print('Language of questions? (jp/en)')
        while True:
            value = input()
            isValidInput = value == 'jp' or value == 'en'
            if isValidInput:
                break
            clearCli()
            print('Language of questions? (jp/en)')
            print(CLI.invalid_command)
        return value

    def selectLessons(self):
        clearCli()
        print('Do you want to do a (s)ingle or a (r)ange of lessons? (s/r)')
        while True:
            value = input()
            isValidInput = value == 's' or value == 'r'
            if isValidInput:
                break
            clearCli()
            print('Do you want to do a (s)ingle or a (r)ange of lessons? (s/r)')
            print(CLI.invalid_command)
        if value == 's':
            startLesson, endLesson = self.selectSingleLesson()
        elif value == 'r':
            startLesson, endLesson = self.selectRangeOfLessons()
        return startLesson, endLesson
    
    def selectSingleLesson(self):
        clearCli()
        print('Type lesson number?')
        while True:
            value = input()
            isValidInput = self.vocabulary.hasLesson(int(value))
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Type lesson number?')
            print('Lesson does not exist')
        selectedLesson = value
        return selectedLesson , selectedLesson

    def selectRangeOfLessons(self):
        clearCli()
        print('Type start lesson number?')
        while True:
            value = input()
            isValidInput = self.vocabulary.hasLesson(int(value))
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Type start lesson number?')
            print('Lesson does not exist')
        startLesson = value

        clearCli()
        print('Start lesson: ', startLesson)
        print('Type end lesson number?')
        while True:
            value = input()
            isLargerThanStart = int(value) > int(startLesson) 
            doesLessonExist = self.vocabulary.hasLesson(int(value))
            isValidInput = isLargerThanStart and doesLessonExist
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Start lesson: ', startLesson)
            print('Type end lesson number?')
            if not doesLessonExist: print('Lesson does not exist')
            elif not isLargerThanStart: print('End lesson should be greater than start lesson')
        endLesson = value
        assert int(endLesson) > int(startLesson)
        return startLesson, endLesson
Example no. 33
def main(args):
    assert FLAGS.training_data_loader, "--training_data_loader is required"
    assert FLAGS.vocab_file, "--vocab_file is required"
    assert FLAGS.train_dir, "--train_dir is required"

    model_config = configuration.ModelConfig()
    training_config = configuration.TrainingConfig()

    print('Loading vocabulary file...')
    vocab = Vocabulary(FLAGS.vocab_file)
    vocab_size = vocab.get_vocabulary_size()

    # Assign parameters to model configuration.
    model_config.vocab_size = vocab_size
    training_config.train_dir = FLAGS.train_dir
    training_config.num_iterations = FLAGS.number_of_steps
    training_config.log_every_n_steps = FLAGS.log_every_n_steps
    training_config.validation_loss_every_n_steps = FLAGS.validation_loss_every_n_steps

    # Create training directory.
    if not tf.gfile.IsDirectory(training_config.train_dir):
        tf.logging.info("Creating training directory: %s",
                        training_config.train_dir)
        tf.gfile.MakeDirs(training_config.train_dir)

    # Build the TensorFlow graph.
    g = tf.Graph()
    with g.as_default():
        print('Building LSTM decoder model...')
        if not FLAGS.repeated_feed_images:
            model = LSTMDecoder(model_config, mode="train")
        else:
            model = LSTMDecoderRepeatedImageFeed(model_config, mode="train")
        model.build()

        # Setup learning rate decay.
        num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                 model_config.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          training_config.num_epochs_per_decay)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(
            training_config.initial_learning_rate,
            global_step,
            decay_steps=decay_steps,
            decay_rate=training_config.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # Setup optimizer.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train = optimizer.minimize(model.total_loss, global_step=global_step)

        # Setup summary.
        all_summary = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train')
        val_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/val')

        # Create saver
        saver = tf.train.Saver(
            max_to_keep=training_config.max_checkpoints_to_keep)

        # Initialize variables.
        print('Initializing variables...')
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        print('Initializing data loader for training set...')
        start = time.time()
        data_loader_train = DataLoader()
        data_loader_train.load(FLAGS.training_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' %
              time_elapsed)

        print('Initializing data loader for validation set...')
        start = time.time()
        data_loader_val = DataLoader()
        data_loader_val.load(FLAGS.validation_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' %
              time_elapsed)

        print('Start training...')
        # Stochastic Gradient Descent
        for i in range(training_config.num_iterations):
            print('Sampling mini-batch...')
            image_features, input_sequence, input_mask, target_sequence =\
                data_loader_train.segmental_sampling(batch_size=training_config.batch_size,
                                                     num_segments=model_config.num_segments)

            _, total_loss, summary = sess.run(
                (train, model.total_loss, all_summary),
                feed_dict={
                    "input_features:0": image_features,
                    "input_feed:0": input_sequence,
                    "input_mask:0": input_mask,
                    "target_sequences:0": target_sequence
                })
            train_writer.add_summary(summary, i)

            # Logging
            if i % training_config.log_every_n_steps == 0:
                print('[%d/%d] loss: %f' %
                      (i, training_config.num_iterations, total_loss))

            # Save model.
            if i % training_config.save_every_n_steps == 0:
                print('Saving model at step %d...' % i)
                saver.save(sess, FLAGS.train_dir + '/model', global_step=i)

            # evaluate the loss with validation set at every epoch
            if i % training_config.validation_loss_every_n_steps == 0:
                image_features, input_sequence, input_mask, target_sequence = \
                    data_loader_val.segmental_sampling(batch_size=training_config.batch_size,
                                                       num_segments=model_config.num_segments)

                total_loss, summary = sess.run(
                    (model.total_loss, all_summary),
                    feed_dict={
                        "input_features:0": image_features,
                        "input_feed:0": input_sequence,
                        "input_mask:0": input_mask,
                        "target_sequences:0": target_sequence
                    })
                val_writer.add_summary(summary, i)
Example no. 34
def load_view(view_name: str):
    """Return a given view from a UI file."""
    return ui.load_view(os.path.join(UI_DIR, view_name))


if __name__ == '__main__':
    # This `builtins` trick fixes a problem where launching the script from
    # the home screen can cause multiple instances to run at once.
    # https://forum.omz-software.com/topic/4097/home-screen-alias-is-script-already-running/
    try:
        (vocab, jinja2env, lookup_view, word_view,
         compact_word_view, about_view, container) = builtins.wordroom
    except (AttributeError, ValueError):
        container = None
    if isinstance(container, ui.View) and container.on_screen:
        pass  # reuse the original globals
    else:  # initialize new globals
        vocab = Vocabulary(data_file=VOCABULARY_FILE)
        jinja2env = Environment(loader=FileSystemLoader(HTML_DIR))
        lookup_view = load_view('lookup')
        word_view = load_view('word')
        compact_word_view = load_view('word')
        about_view = load_view('about')
        container = AdaptiveView(lookup_view, word_view)
        container.name = 'WordRoom'
        container.present('fullscreen', hide_title_bar=True)
        builtins.wordroom = (vocab, jinja2env, lookup_view, word_view,
                             compact_word_view, about_view, container)
    # if appex.is_running_extension():
    #    load_word_view(appex.get_text())
Example no. 35
    def chat(self, question, chat_settings):
        """Chat with the chatbot model by predicting an answer to a question.
        'question' and 'answer' in this context are generic terms for the interactions in a dialog exchange
        and can be statements, remarks, queries, requests, or any other type of dialog speech.
        For example:
        Question: "How are you?"     Answer: "Fine."
        Question: "That's great."    Answer: "Yeah."

        Args:
            question: The input question for which the model should predict an answer.

            chat_settings: The ChatSettings instance containing the chat settings and inference hyperparameters

        Returns:
            q_with_hist: the question with history if chat_settings.show_question_context is True, otherwise None.

            answers: an array of answer beams if chat_settings.show_all_beams is True, otherwise the single selected answer.
        """
        #Process the question by cleaning it and converting it to an integer encoded vector
        question = Vocabulary.clean_text(question)
        question = self.input_vocabulary.words2ints(question)

        #Prepend the currently tracked steps of the conversation history separated by EOS tokens.
        #This allows for deeper dialog context to influence the answer prediction.
        question_with_history = []
        for i in range(len(self.conversation_history)):
            question_with_history += self.conversation_history[i] + [
                self.input_vocabulary.eos_int()
            ]
        question_with_history += question

        #Get the answer prediction
        batch = np.zeros((1, len(question_with_history)))
        batch[0] = question_with_history
        max_output_sequence_length = chat_settings.inference_hparams.max_answer_words + 1  # + 1 since the EOS token is counted as a timestep
        predicted_answer_info = self.predict_batch(
            inputs=batch,
            input_sequence_length=np.array([len(question_with_history)]),
            max_output_sequence_length=max_output_sequence_length,
            beam_length_penalty_weight=chat_settings.inference_hparams.
            beam_length_penalty_weight,
            sampling_temperature=chat_settings.inference_hparams.
            sampling_temperature,
            log_summary=chat_settings.inference_hparams.log_summary)

        #Read the answer prediction
        answer_beams = []
        if self.beam_width > 0:
            #For beam search decoding: if show_all_beams is enabled then output all beams (sequences), otherwise take the first beam.
            #   The beams (in the "predictions" matrix) are ordered with the highest ranked beams first.
            beam_count = 1 if not chat_settings.show_all_beams else len(
                predicted_answer_info["predictions_seq_lengths"][0])
            for i in range(beam_count):
                predicted_answer_seq_length = predicted_answer_info[
                    "predictions_seq_lengths"][0][
                        i] - 1  #-1 to exclude the EOS token
                predicted_answer = predicted_answer_info["predictions"][
                    0][:predicted_answer_seq_length, i].tolist()
                answer_beams.append(predicted_answer)
        else:
            #For greedy / sampling decoding: only one beam (sequence) is returned, based on the argmax for greedy decoding
            #   or the sampling distribution for sampling decoding. Return this beam.
            beam_count = 1
            predicted_answer_seq_length = predicted_answer_info[
                "predictions_seq_lengths"][0] - 1  #-1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][
                0][:predicted_answer_seq_length].tolist()
            answer_beams.append(predicted_answer)

        #Add new conversation steps to the end of the history and trim from the beginning if it is longer than conv_history_length
        self.conversation_history.append(question)
        self.conversation_history.append(answer_beams[0])
        self.trim_conversation_history(
            chat_settings.inference_hparams.conv_history_length)

        #Convert the answer(s) to text and return
        answers = []
        for i in range(beam_count):
            answer = self.output_vocabulary.ints2words(answer_beams[i])
            answers.append(answer)

        q_with_hist = None if not chat_settings.show_question_context else self.output_vocabulary.ints2words(
            question_with_history)
        if chat_settings.show_all_beams:
            return q_with_hist, answers
        else:
            return q_with_hist, answers[0]
Example no. 36
def train(trainFile, devFile, gramsNumber, smoothStrategy, BLaplace):
    # process data
    with open(trainFile, "r") as f:
        corpusTrain = f.readlines()
    with open(devFile, "r") as f:
        corpusDev = f.readlines()
    corpusTrainDev = corpusTrain + corpusDev

    if smoothStrategy == "laplace":
        vocab = Vocabulary(gramsNumber, corpusTrainDev)
        vocab.tune_with_Laplace_smoothing(BLaplace)
    elif smoothStrategy == "held_out":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_held_out_smoothing(corpusDev)
    elif smoothStrategy == "cross_valid":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_cross_val_smoothing(corpusDev)
    elif smoothStrategy == "good_turing":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_good_turing_smoothing()
    else:
        raise KeyError("Unknown smoothing strategy: " + smoothStrategy)
    return vocab
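
A hedged usage sketch, assuming the Vocabulary class above implements the tune_with_* smoothing methods called in train(); the corpus file names are hypothetical:

vocab = train('train.txt', 'dev.txt', gramsNumber=2,
              smoothStrategy='laplace', BLaplace=1.0)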
Example no. 37
    return avi_data, targets, lengths





if __name__ == '__main__':
    import time
    from torch.autograd import Variable
    from torch.nn.utils.rnn import pack_padded_sequence
    from checkpoint import *

    json_file = 'data/testing_label.json'
    numpy_file = 'data/testing_data/feat'

    helper = Vocabulary(json_file, min_word_count=5)

    dataset = TrainingDataset(label_json_file=json_file, training_data_path=numpy_file, helper=helper, load_into_ram=True)

    dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=8, collate_fn=collate_fn)

    ss = time.time()

    for epoch in range(1):

        s = time.time()

        print('epoch: {}'.format(epoch+1))
        for batch_n, batch in enumerate(dataloader):

            #e = time.time()
Example no. 38
    def __init__(self,
                 in_vocab,
                 output_vocabularies,
                 state_encoder_builder,
                 valid_action_fn,
                 args):
        SconeModel.__init__(self,
                            state_encoder_builder,
                            in_vocab,
                            args.embeddings_size,
                            args.num_enc_layers,
                            args.encoder_size,
                            args.decoder_size,
                            RNNBuilder)

        self.args = args
        self._dropout = 0.

        # Output vocabs and embeddings.
        self.output_action_vocabulary = Vocabulary(output_vocabularies[0], [EOS, BEG])
        self.output_location_vocabulary = Vocabulary(output_vocabularies[1], [NO_ARG, BEG])
        self.output_argument_vocabulary = Vocabulary(output_vocabularies[2], [NO_ARG, BEG])

        # All outputs vocabulary.
        all_vocabulary_list = []
        self._valid_action_indices = []
        index = 0
        for action in self.output_action_vocabulary:
            for location in self.output_location_vocabulary:
                for argument in self.output_argument_vocabulary:
                    if action != BEG and location != BEG and argument != BEG:
                        if valid_action_fn(action, location, argument):
                            self._valid_action_indices.append(index)
                        all_vocabulary_list.append((action, location, argument))
                        index += 1
        self._all_output_vocabulary = Vocabulary(all_vocabulary_list, [])

        self._output_action_embeddings = self._pc.add_lookup_parameters(
            (len(self.output_action_vocabulary),
             args.embeddings_size),
            name="output-action-embeddings")
        self._output_location_embeddings = self._pc.add_lookup_parameters(
            (len(self.output_location_vocabulary),
             args.embeddings_size),
            name="output-location-embeddings")
        self._output_argument_embeddings = self._pc.add_lookup_parameters(
            (len(self.output_argument_vocabulary),
             args.embeddings_size),
            name="output-argument-embeddings")

        # Action decoder RNN.
        self._dec_input_size = args.encoder_size * 2 \
            + args.encoder_size * 2 \
            + self._state_encoder.item_size() * 2 \
            + args.embeddings_size * 3
        self._decoder = RNNBuilder(args.num_dec_layers,
                                   self._dec_input_size,
                                   args.decoder_size,
                                   self._pc)

        situated_in_size = self._dec_input_size
        if self.args.always_initial_state:
            self._state_attention_winitial = self._pc.add_parameters(
                (self.args.encoder_size * 2 + self.args.decoder_size,
                 self._state_encoder.item_size()),
                name="state-attention-winitial")
            self._state_attention_winitial2 = self._pc.add_parameters(
                (self.args.encoder_size * 2 + self.args.decoder_size,
                 self._state_encoder.item_size()),
                name="state-attention-winitial2")
            situated_in_size += 2 * self._state_encoder.item_size()

        # MLP parameters to mix the situated embedding.
        self._situated_w = self._pc.add_parameters(
            (self._dec_input_size, situated_in_size),
            name="situated-w")
        self._situated_b = self._pc.add_parameters((self._dec_input_size),
                                                   name="situated-b")

        # Project the RNN output to a vector that is the length of the output
        # vocabulary.
        self._final_w = self._pc.add_parameters(
            (args.decoder_size, args.decoder_size), name="final-w")

        self._output_w_action = self._pc.add_parameters(
            (len(self.output_action_vocabulary) - 1, args.decoder_size),
            name="output-w-action")
        self._output_w_location = self._pc.add_parameters(
            (len(self.output_location_vocabulary) - 1, args.decoder_size),
            name="output-w-location")
        self._output_w_argument = self._pc.add_parameters(
            (len(self.output_argument_vocabulary) - 1, args.decoder_size),
            name="output-w-argument")
Example no. 39
class ChatBot:
    def __init__(self, layers=5, maxlen=10, embedding_size=128, batch_size=32, is_train=True, lr=0.0001):
        self.layers = layers
        self.maxlen = maxlen
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        self.learning_rate = lr
        self.model_path = "model/chatbot/model.npz" #what is npz? It is the extension , it is the file in which we save the weight of our seq2seq model.

        ## Vocabulary
        self.vocab = Vocabulary(corpus=None, maxlen=maxlen)
        self.vocab_size = self.vocab.vocab_size

        ## Init Session
        sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        tf.reset_default_graph()
        self.sess = tf.Session(config=sess_config)

        ## Placeholders
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
        self.decoder_outputs = tf.placeholder(tf.int32, shape=[None, None])
        self.mask = tf.placeholder(tf.int32, shape=[None, None])

        ## Model
        self.net_out, _ = self.create_model(
            self.encoder_inputs,
            self.decoder_inputs,
            self.vocab_size,
            self.embedding_size,
            reuse=False)
        self.net_out.print_params(False)

        self.loss = tl.cost.cross_entropy_seq_with_mask(
            logits=self.net_out.outputs,
            target_seqs=self.decoder_outputs,
            input_mask=self.mask,
            return_details=False,
            name='cost')

        ## Optimizer
        self.train_op = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate).minimize(self.loss)

    def train(self, X, Y, num_epochs=1):
        ## Init Vars
        self.sess.run(tf.global_variables_initializer())

        ## Load Model
        tl.files.load_and_assign_npz(sess=self.sess, name=self.model_path, network=self.net_out)

        n_step = len(X)//self.batch_size

        for epoch in range(num_epochs):
            X, Y = shuffle(X, Y, random_state=0)
            total_loss, n_iter = 0, 0
            for x, y in tqdm(tl.iterate.minibatches(
                inputs=X,
                targets=Y,
                batch_size=self.batch_size,
                shuffle=False),
                total=n_step,
                desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs),
                leave=False):

                x1, x2, y1, W = self.vocab.dataset(x, y)
                feed_data = {}
                feed_data[self.encoder_inputs] = x1
                feed_data[self.decoder_inputs] = x2
                feed_data[self.decoder_outputs] = y1
                feed_data[self.mask] = W


                _, loss_iter = self.sess.run([self.train_op, self.loss], feed_dict=feed_data)
                total_loss += loss_iter
                n_iter += 1

            ## printing average loss after every epoch
            print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs, total_loss / n_iter))

            ## saving the model
            tl.files.save_npz(self.net_out.all_params, name=self.model_path, sess=self.sess)

        ## session cleanup
        self.sess.close()


    """
    Creates the LSTM Model
    """
    def create_model(self, encoder_inputs, decoder_inputs, vocab_size, emb_dim, is_train=True, reuse=False):
        with tf.variable_scope("model", reuse=reuse):
            # for a chatbot, you can use the same embedding layer for encoder and decoder;
            # for translation, you may want two separate embedding layers
            with tf.variable_scope("embedding") as vs:
                net_encode = EmbeddingInputlayer(
                    inputs = encoder_inputs,
                    vocabulary_size = vocab_size,
                    embedding_size = emb_dim,
                    name = 'seq_embedding')
                vs.reuse_variables()
                net_decode = EmbeddingInputlayer(
                    inputs = decoder_inputs,
                    vocabulary_size = vocab_size,
                    embedding_size = emb_dim,
                    name = 'seq_embedding')

            net_rnn = Seq2Seq(net_encode, net_decode,
                    cell_fn = tf.nn.rnn_cell.LSTMCell,
                    n_hidden = emb_dim,
                    initializer = tf.random_uniform_initializer(-0.1, 0.1),
                    encode_sequence_length = retrieve_seq_length_op2(encoder_inputs),
                    decode_sequence_length = retrieve_seq_length_op2(decoder_inputs),
                    initial_state_encode = None,
                    dropout = (0.5 if is_train else None),
                    n_layer = self.layers,
                    return_seq_2d = True,
                    name = 'seq2seq')

            net_out = DenseLayer(net_rnn, n_units=vocab_size, act=tf.identity, name='output')
        return net_out, net_rnn


    def infer(self, query):
        unk_id = self.vocab.word_index["<unk>"]
        pad_id = self.vocab.word_index["<pad>"]

        start_id = self.vocab.word_index["<start>"]
        end_id = self.vocab.word_index["<end>"]

        ## Init Session
        sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        tf.reset_default_graph()
        sess = tf.Session(config=sess_config)

        ## Inference Data Placeholders
        encode_inputs = tf.placeholder(dtype=tf.int64, shape=[1, None], name="encode_inputs")
        decode_inputs = tf.placeholder(dtype=tf.int64, shape=[1, None], name="decode_inputs")

        net, net_rnn = self.create_model(
            encode_inputs,
            decode_inputs,
            self.vocab_size,
            self.embedding_size,
            is_train=False,
            reuse=False)
        y = tf.nn.softmax(net.outputs)

        ## Init Vars
        sess.run(tf.global_variables_initializer())

        ## Load Model
        tl.files.load_and_assign_npz(sess=sess, name=self.model_path, network=net)

        """
        Inference using pre-trained model
        """
        def inference(seed):
            seed_id = self.vocab.text_to_sequence(seed)

            ## Encode and get state
            state = sess.run(net_rnn.final_state_encode, {encode_inputs: [seed_id]})

            ## Decode, feed start_id and get first word [https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_ptb_lstm_state_is_tuple.py]
            o, state = sess.run([y, net_rnn.final_state_decode], {
                net_rnn.initial_state_decode: state,
                decode_inputs: [[start_id]]})
            w_id = tl.nlp.sample_top(o[0], top_k=3)
            #w = self.vocab.index_word[w_id]

            ## Decode and feed state iteratively
            sentence = [w_id]
            for _ in range(self.maxlen): # max sentence length
                o, state = sess.run([y, net_rnn.final_state_decode],{
                    net_rnn.initial_state_decode: state,
                    decode_inputs: [[w_id]]})
                w_id = tl.nlp.sample_top(o[0], top_k=2)
                #w = self.vocab.index_word[w_id]
                if w_id == end_id:
                    break
                sentence = sentence + [w_id]
            return sentence

        ## infer
        sentence = inference(query)
        response = self.vocab.seqs_to_text(sentence)
        response = " ".join(response.split(" "))
        return response
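
A hedged usage sketch, assuming the TensorLayer/TensorFlow 1.x stack imported by the original module is available; the toy question/answer pairs are hypothetical:

bot = ChatBot(layers=3, maxlen=10, batch_size=2)
X = ['hi there', 'how are you']
Y = ['hello', 'fine thanks']
bot.train(X, Y, num_epochs=1)    # saves weights to model/chatbot/model.npz
print(bot.infer('hi there'))     # decoded reply from the saved model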
Example no. 40
def prepare_data(config):

    print('Loading data for ' + config.phase)
    if config.phase == 'train':
        filetemp = os.path.join(config.train_dir, config.temp_train_file)
    elif config.phase == 'eval':
        filetemp = os.path.join(config.eval_dir, config.temp_eval_file)
    elif config.phase == 'test':
        filetemp = os.path.join(config.test_dir, config.temp_test_file)

    data = np.load(filetemp).item()
    src = data['src']
    dst = data['dst']

    #
    print("Building the vocabulary...")

    vocabulary1 = Vocabulary(config.vocab1_size, save_file=config.vocab1_file)
    #vocabulary1.save(config.vocab1_file)
    print("Vocabulary built.")

    #
    if config.phase == 'train':
        filetemp = os.path.join(config.train_dir, config.train_file)
    elif config.phase == 'eval':
        filetemp = os.path.join(config.eval_dir, config.eval_file)
    elif config.phase == 'test':
        filetemp = os.path.join(config.test_dir, config.test_file)

    if True:  #not os.path.exists(filetemp):
        word_idxs1, word_idxs2 = [], []
        masks1, masks2 = [], []
        len1, len2 = [], []
        for sent in src:  #tqdm(src):
            current_word_idxs_ = vocabulary1.process_sentence(sent)
            current_num_words = len(current_word_idxs_)
            #
            len1.append(len(current_word_idxs_))
            #print('len(current_word_idxs_)', len(current_word_idxs_))

            current_word_idxs = np.zeros(config.max_input_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_input_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs1.append(current_word_idxs)
            masks1.append(current_masks)

        print('src max length', max(len1))
        #
        #import pdb;pdb.set_trace()
        for sent in dst:  #tqdm(dst):
            current_word_idxs_ = vocabulary1.process_sentence(sent + ' stop')
            current_num_words = len(current_word_idxs_)
            #
            len2.append(len(current_word_idxs_))
            #print('len(current_word_idxs_)', len(current_word_idxs_))

            current_word_idxs = np.zeros(config.max_output_length,
                                         dtype=np.int32)
            current_masks = np.zeros(config.max_output_length)
            current_word_idxs[:current_num_words] = np.array(
                current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs2.append(current_word_idxs)
            masks2.append(current_masks)

        print('dst max length', max(len2))
        #
        word_idxs1 = np.array(word_idxs1)
        masks1 = np.array(masks1)
        word_idxs2 = np.array(word_idxs2)
        masks2 = np.array(masks2)
        len1 = np.array(len1)
        len2 = np.array(len2)
        data = {
            'word_idxs1': word_idxs1,
            'masks1': masks1,
            'word_idxs2': word_idxs2,
            'masks2': masks2,
            'len1': len1,
            'len2': len2
        }
        np.save(filetemp, data)
    else:
        data = np.load(filetemp).item()
        word_idxs1 = data['word_idxs1']
        masks1 = data['masks1']
        len1 = data['len1']
        word_idxs2 = data['word_idxs2']
        masks2 = data['masks2']
        len2 = data['len2']
    #
    print("Building the dataset...")
    is_train = config.phase == 'train'
    dataset = DataSet(word_idxs1,
                      masks1,
                      len1,
                      config.batch_size,
                      word_idxs2,
                      masks2,
                      len2,
                      is_train=is_train,
                      shuffle=is_train)
    print("Dataset built.")
    print("prepare data for " + config.phase + " done!")
    return dataset, vocabulary1  #, vocabulary2
Example no. 41
class ConstrainedContextSeq2SeqEmbeddings(SconeModel):
    """Model that predicts a sequence of actions (action and arguments).

    Attributes:

    Todo:
        * Consider refactoring. E.g., have a class for an encoder and a
            decoder.
        * Fewer parameters in the constructor.
    """

    def __init__(self,
                 in_vocab,
                 output_vocabularies,
                 state_encoder_builder,
                 valid_action_fn,
                 args):
        SconeModel.__init__(self,
                            state_encoder_builder,
                            in_vocab,
                            args.embeddings_size,
                            args.num_enc_layers,
                            args.encoder_size,
                            args.decoder_size,
                            RNNBuilder)

        self.args = args
        self._dropout = 0.

        # Output vocabs and embeddings.
        self.output_action_vocabulary = Vocabulary(output_vocabularies[0], [EOS, BEG])
        self.output_location_vocabulary = Vocabulary(output_vocabularies[1], [NO_ARG, BEG])
        self.output_argument_vocabulary = Vocabulary(output_vocabularies[2], [NO_ARG, BEG])

        # All outputs vocabulary.
        all_vocabulary_list = []
        self._valid_action_indices = []
        index = 0
        for action in self.output_action_vocabulary:
            for location in self.output_location_vocabulary:
                for argument in self.output_argument_vocabulary:
                    if action != BEG and location != BEG and argument != BEG:
                        if valid_action_fn(action, location, argument):
                            self._valid_action_indices.append(index)
                        all_vocabulary_list.append((action, location, argument))
                        index += 1
        self._all_output_vocabulary = Vocabulary(all_vocabulary_list, [])

        self._output_action_embeddings = self._pc.add_lookup_parameters(
            (len(self.output_action_vocabulary),
             args.embeddings_size),
            name="output-action-embeddings")
        self._output_location_embeddings = self._pc.add_lookup_parameters(
            (len(self.output_location_vocabulary),
             args.embeddings_size),
            name="output-location-embeddings")
        self._output_argument_embeddings = self._pc.add_lookup_parameters(
            (len(self.output_argument_vocabulary),
             args.embeddings_size),
            name="output-argument-embeddings")

        # Action decoder RNN.
        self._dec_input_size = args.encoder_size * 2 \
            + args.encoder_size * 2 \
            + self._state_encoder.item_size() * 2 \
            + args.embeddings_size * 3
        self._decoder = RNNBuilder(args.num_dec_layers,
                                   self._dec_input_size,
                                   args.decoder_size,
                                   self._pc)

        situated_in_size = self._dec_input_size
        if self.args.always_initial_state:
            self._state_attention_winitial = self._pc.add_parameters(
                (self.args.encoder_size * 2 + self.args.decoder_size,
                 self._state_encoder.item_size()),
                name="state-attention-winitial")
            self._state_attention_winitial2 = self._pc.add_parameters(
                (self.args.encoder_size * 2 + self.args.decoder_size,
                 self._state_encoder.item_size()),
                name="state-attention-winitial2")
            situated_in_size += 2 * self._state_encoder.item_size()

        # MLP parameters to mix the situated embedding.
        self._situated_w = self._pc.add_parameters(
            (self._dec_input_size, situated_in_size),
            name="situated-w")
        self._situated_b = self._pc.add_parameters((self._dec_input_size),
                                                   name="situated-b")

        # Project the RNN output to a vector that is the length of the output
        # vocabulary.
        self._final_w = self._pc.add_parameters(
            (args.decoder_size, args.decoder_size), name="final-w")

        self._output_w_action = self._pc.add_parameters(
            (len(self.output_action_vocabulary) - 1, args.decoder_size),
            name="output-w-action")
        self._output_w_location = self._pc.add_parameters(
            (len(self.output_location_vocabulary) - 1, args.decoder_size),
            name="output-w-location")
        self._output_w_argument = self._pc.add_parameters(
            (len(self.output_argument_vocabulary) - 1, args.decoder_size),
            name="output-w-argument")

    def probability_of_token(self, token, probability_dist):
        return probability_dist[self._all_output_vocabulary.lookup_index(tuple(token))]

    def set_dropout(self, amount):
        """ Sets the dropout amount for the model, changes during various learning stages.

        Inputs:
            amount (float): Amount of dropout to apply.
        """
        self._dropout = amount

    def compute_entropy(self, distribution):
        """ Gets the entropy of a probability distribution that may contain zeroes.

        Inputs:
            distribution (dy.Expression): The probability distribution.

        Returns:
            dy.Expression representing the entropy.
        """
        num_actions = len(self.output_action_vocabulary) - 1
        num_locations = len(self.output_location_vocabulary) - 1
        num_arguments = len(self.output_argument_vocabulary) - 1
        valid_mask = numpy.zeros(num_actions * num_locations * num_arguments)
        for index in self._valid_action_indices:
            valid_mask[index] = 1.
        # This mask is one for all valid indices, and zero for all others.
        valid_mask = dy.inputTensor(valid_mask)

        # This basically replaces everything in the probability distribution
        # with the original value (if valid), or zero (if not valid).
        valid_probs = dy.cmult(valid_mask, distribution)

        # The inverse of valid mask, this gives a value of 1. if something is invalid.
        invalid_probs = 1.-valid_mask

        # The result of this operation is that everything that's valid gets its
        # original probability, and everything that's not gets a probability of 1.
        probs = valid_probs + invalid_probs

        # dy.log(probs) will give log(p(action)) if action is valid, and
        # log(1)=0 for invalid actions.
        # then entropies will be zero for everything that isn't valid, and the
        # actual p log(p) otherwise.
        entropies = dy.cmult(probs, dy.log(probs + 0.00000000001))
        return -dy.sum_elems(entropies)
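
    # Illustrative sketch (not part of the original model): the same masking
    # trick in plain numpy. Invalid entries are replaced with 1.0 so that
    # p * log(p) is exactly zero for them and only valid actions contribute
    # to the entropy sum.
    @staticmethod
    def _masked_entropy_sketch(distribution, valid_mask):
        probs = valid_mask * distribution + (1.0 - valid_mask)
        return -numpy.sum(probs * numpy.log(probs + 0.00000000001))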


    def action_probabilities(self, distribution):
        num_actions = len(self.output_action_vocabulary) - 1
        num_locations = len(self.output_location_vocabulary) - 1
        num_arguments = len(self.output_argument_vocabulary) - 1
        zeroes = numpy.zeros(num_locations * num_arguments)
        ones = numpy.ones(num_locations * num_arguments)
        
        actions_masks = []
        probs = { }
        action_idx = 0
        for action in self.output_action_vocabulary:
            if action != BEG:
                masks = numpy.concatenate(
                            (numpy.repeat(zeroes, action_idx),
                             ones,
                             numpy.repeat(zeroes, num_actions - action_idx - 1)))
                actions_masks = dy.reshape(dy.inputTensor(masks),
                                           (num_actions * num_locations * num_arguments, 1))
                action_prob = dy.sum_elems(dy.cmult(actions_masks, distribution))
                probs[action] = action_prob
                action_idx += 1
        return probs

    def group_tokens(self, string):
        """ Groups tokens from a flat list of strings into action sequence.

        Inputs:
            string (list of str): Flat action sequence.

        Returns:
            list of tuple, representing parameterized actions.
        """
        seq = []
        current_triple = []
        for token in string:
            if token in self.output_action_vocabulary:
                if len(current_triple) == 3:
                    # Push the current triple and add this one
                    seq.append(current_triple)
                elif len(current_triple) < 3 and current_triple:
                    # Means that there were no arguments
                    current_triple.extend(
                        [NO_ARG for _ in range(3 - len(current_triple))])
                    assert len(current_triple) == 3
                    seq.append(current_triple)
                current_triple = [token]
            elif token in self.output_location_vocabulary:
                assert len(current_triple) == 1, \
                    "Location " + str(token) + " must follow an action," \
                    + " but current triple was " + str(current_triple)
                current_triple.append(token)
            elif token in self.output_argument_vocabulary:
                assert len(current_triple) == 2, \
                    "Argument " + str(token) + " must follow an action and location," \
                    + " but current triple was " + str(current_triple)
                current_triple.append(token)
        if len(current_triple) < 3 and current_triple:
            current_triple.extend(
                [NO_ARG for _ in range(3 - len(current_triple))])
        assert len(current_triple) == 3 or not current_triple
        if len(current_triple) == 3:
            seq.append(current_triple)
        return seq
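
    # For example (the action/location/argument names here are hypothetical), a
    # flat sequence such as ['push', '2', 'red', 'pop'] is grouped into
    # [['push', '2', 'red'], ['pop', NO_ARG, NO_ARG]]: each action token starts a
    # new triple, and missing location/argument slots are padded with NO_ARG.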

    def _out_to_int(self, string, add_eos=False):
        if add_eos:
            string = list(string) + [EOS]
        else:
            string = list(string)

        return [(self.output_action_vocabulary.lookup_index(tok[0]),
                 self.output_location_vocabulary.lookup_index(tok[1]),
                 self.output_argument_vocabulary.lookup_index(tok[2])) \
                    for tok in self.group_tokens(string)]

    def _get_probs(self, rnn_output, restrict=None):
        final_w = dy.parameter(self._final_w)
        output_w_action = dy.parameter(self._output_w_action)
        output_w_location = dy.parameter(self._output_w_location)
        output_w_argument = dy.parameter(self._output_w_argument)

        intermediate_state = final_w * rnn_output
        if self.args.final_nonlinearity:
            intermediate_state = dy.tanh(intermediate_state)
        action_scores = output_w_action * intermediate_state
        location_scores = output_w_location * intermediate_state
        argument_scores = output_w_argument * intermediate_state

        flattened_scores = flatten_triple(action_scores, location_scores, argument_scores)
        if restrict or self.args.syntax_restricted:
            restrict_tokens = self._valid_action_indices
            if restrict:
                restrict_tokens = restrict
            return dy.exp(dy.log_softmax(flattened_scores,
                                         restrict=restrict_tokens))
        else:
            probs = dy.softmax(flattened_scores)
        return probs

    def _predict(self, rnn_output, fsa_restricted=False, fsa=None):
        # Forces a forward pass to get value.
        probs = self._get_probs(
            rnn_output,
            restrict=fsa.valid_actions(self._all_output_vocabulary) if fsa_restricted else None).value()
        max_tuple = numpy.argmax(probs)
        predicted_token = self._all_output_vocabulary.lookup_token(max_tuple)

        return (predicted_token, probs[max_tuple])

    def _init_decoder(self):
        return self._decoder.initial_state().add_input(dy.vecInput(self._dec_input_size))

    def _embed_predicted_triple(self, triple):
        return dy.concatenate([self._output_action_embeddings[triple[0]],
                               self._output_location_embeddings[triple[1]],
                               self._output_argument_embeddings[triple[2]]])

    def _decoder_input_embedding(self,
                                 rnn_state,
                                 previous_triple,
                                 encoded_string,
                                 enc_state,
                                 encoded_history,
                                 training=False,
                                 initial_state=None):
        attention_vecs = {}

        # Compute attention over encodded string.
        utterance_attn, utterance_dist = attend(encoded_string,
                                                rnn_state.h()[-1],
                                                dy.parameter(self._utterance_attention_w),
                                                self._dropout if training else 0.)
        attention_vecs['utterance'] = utterance_dist

        # Key for state and history attention.
        attn_key = dy.concatenate([utterance_attn, rnn_state.h()[-1]])
        if training:
            attn_key = dy.dropout(attn_key, self._dropout)

        # Attend on history using current state and utterance attention.
        history_attn, history_dist = attend(encoded_history,
                                            attn_key,
                                            dy.parameter(self._history_attention_w),
                                            self._dropout if training else 0.)
        attention_vecs['history'] = history_dist

        # Attend on state.
        state_attn, state_dist = attend(enc_state,
                                        attn_key,
                                        dy.parameter(self._state_attention_w),
                                        self._dropout if training else 0.)
        state_attn2, state_dist2 = attend(enc_state,
                                          attn_key,
                                          dy.parameter(self._state_attention_w2),
                                          self._dropout if training else 0.)
        attention_vecs['state_1'] = state_dist
        attention_vecs['state_2'] = state_dist2

        # Compute previous embedding
        prev_emb = self._embed_predicted_triple(previous_triple)

        # Concatenate with history and state, and mix with a feed-forward
        # layer.
        situated_embedding = dy.concatenate([utterance_attn,
                                             history_attn,
                                             state_attn,
                                             state_attn2,
                                             prev_emb])

        # Attend on initial state (if provided)
        if self.args.feed_updated_state and self.args.always_initial_state:
            if not initial_state:
                raise ValueError("Encoding the initial state but it was not provided.")
            initial_attn, initial_dist = attend(initial_state,
                                                attn_key,
                                                dy.parameter(self._state_attention_winitial),
                                                self._dropout if training else 0.)
            initial_attn2, initial_dist2 = attend(initial_state,
                                                  attn_key,
                                                  dy.parameter(self._state_attention_winitial2),
                                                  self._dropout if training else 0.)
            attention_vecs['initial_1'] = initial_dist
            attention_vecs['initial_2'] = initial_dist2

            situated_embedding = dy.concatenate([situated_embedding,
                                                 initial_attn,
                                                 initial_attn2])

        # Situated embedding mixing parameters.
        weights = dy.parameter(self._situated_w)
        biases = dy.parameter(self._situated_b)

        situated_embedding = dy.tanh(weights * situated_embedding + biases)

        return situated_embedding, attention_vecs

    def get_losses(
            self,
            utterance,
            output_seq,
            state,
            history,
            fsa=None,
            training=False):
        """Gets the losses of a gold sequence.

        Args:
            utterance (list of str): Represents the current utterance.
            output_seq (list of triple of str): Represents the gold output sequence.
            state (WorldState): Represents the state of the environment.
            history (list of list of str): Represents the previous utterances.
            fsa (ExecutableFSA, optional): An FSA builder object.
            training (bool, optional): Whether or not you are training right now.

        Returns:
            list of dy.Expression, where each corresponds to the loss at each
                gold output prediction.

        """
        enc_utterance, enc_history, enc_state = self._encode_inputs(
            utterance, state, history)
        initial_encoded_state = enc_state

        output_seq = self.group_tokens(output_seq + [EOS])

        # Run the decoder (forced decoding).
        rnn_state = self._init_decoder()
        losses = []
        prev_token_ints = (self.output_action_vocabulary.lookup_index(BEG),
                           self.output_location_vocabulary.lookup_index(BEG),
                           self.output_argument_vocabulary.lookup_index(BEG))
        for i, output_token in enumerate(output_seq):
            if self.args.feed_updated_state:
                if not fsa:
                    raise ValueError("Attempting to feed the updated state " \
                                     + "no FSA was provided")
                enc_state = self._state_encoder.encode(fsa.state())
            # Compute the decoder input.
            situated_embedding, _ = self._decoder_input_embedding(
                rnn_state,
                prev_token_ints,
                enc_utterance,
                enc_state,
                enc_history,
                training,
                initial_state=initial_encoded_state if self.args.always_initial_state else None)
            if training:
                situated_embedding = dy.dropout(
                    situated_embedding, self._dropout)

            # Weird choice -- the embedding of the previously generated token is
            # not added here. TODO: fix
            rnn_state = rnn_state.add_input(situated_embedding)

            gold_index = self._all_output_vocabulary.lookup_index(tuple(output_token))
            log_prob_token = dy.log(self._get_probs(rnn_state.output())[gold_index])

            if self.args.feed_updated_state and output_token != (EOS, NO_ARG, NO_ARG) and output_token != [EOS, NO_ARG, NO_ARG]:
                fsa.feed_complete_action(*output_token)

            # Loss of labeled token.
            losses.append(-log_prob_token)

            prev_token_ints = (self.output_action_vocabulary.lookup_index(output_token[0]),
                               self.output_location_vocabulary.lookup_index(output_token[1]),
                               self.output_argument_vocabulary.lookup_index(output_token[2]))

        return losses

    def _update_rnn_state(self,
                          encoded_states,
                          fsa,
                          rnn_state,
                          previous_token,
                          initial_state=None,
                          training=False):
        """ Generates a single token given a state.
        """
        # Generate only if at the beginning of the sequence or the
        # previously generated token was EOS.
        utterance = encoded_states[0]
        history = encoded_states[1]
        world_state = encoded_states[2]

        if self.args.feed_updated_state:
            if not fsa:
                raise ValueError("Attempting to feed the updated state " \
                                 + "no FSA was provided")
            if not fsa.state():
                raise ValueError("Attempting to feed the updated state " \
                                 + "FSA state was None")
            world_state = self._state_encoder.encode(fsa.state())
        situated_embedding, attentions = self._decoder_input_embedding(
            rnn_state,
            previous_token,
            utterance,
            world_state,
            history,
            initial_state=initial_state,
            training=training)
        if training:
            situated_embedding = dy.dropout(situated_embedding, self._dropout)
        return rnn_state.add_input(situated_embedding), attentions

    def _policy_shape_probs(self,
                            prob_dist):
        # TODO: this is specific to Alchemy
        num_actions = len(self.output_action_vocabulary) - 1
        num_locations = len(self.output_location_vocabulary) - 1
        num_arguments = len(self.output_argument_vocabulary) - 1
        new_probdist = dy.zeros(prob_dist.dim()[0])
        zeroes = numpy.zeros(num_locations * num_arguments)
        ones = numpy.ones(num_locations * num_arguments)
        eos_prob = prob_dist[self._all_output_vocabulary.lookup_index((EOS, NO_ARG, NO_ARG))]
        action_idx = 0
        for action in self.output_action_vocabulary:
            if action != BEG:
                masks = numpy.concatenate(
                            (numpy.repeat(zeroes, action_idx),
                             ones,
                             numpy.repeat(zeroes, num_actions - action_idx - 1)))
                actions_masks = dy.reshape(dy.inputTensor(masks),
                                           (num_actions * num_locations * num_arguments, 1))
                if action == EOS:
                    new_probdist += dy.cmult(actions_masks, prob_dist) / 2.
                elif action == "push":
                    new_probdist += dy.cmult(actions_masks, prob_dist) + eos_prob / (2. * 56.)
                elif action == "pop":
                    new_probdist += dy.cmult(actions_masks, prob_dist)
                action_idx += 1

        if self.args.syntax_restricted:
            return dy.exp(dy.log_softmax(dy.cmult(new_probdist, prob_dist),
                                         restrict = self._valid_action_indices))
        else:
            return dy.softmax(dy.cmult(new_probdist, prob_dist))

    def sample_sequences(self,
                         batch,
                         length=LEN_LIMIT,
                         training=False,
                         fsa_builder=None):
        """Rolls out using a policy (the probability distribution.

        Args:
            batch (list of examples): The batch that is being used to roll
                out.
            length (int, optional): The maximum length of the roll out.
            training (bool, optional): Whether or not training.
            fsa_builder (ExecutableFSA): An FSA that can be used to constrain.

        Returns:
            A pair (batch_prob_sequences, batch_sequences): for each example, the
                list of probability distributions used at each step and the list
                of sampled (token, probability) pairs.

        Todo:
            * Docstring.
            * No use of 'filter'.
            * Make returned value more clear.
            * Fewer branches.
            * Shorter (i.e. refactor).
        """
        sample_start = time.time()
        batch_states = []
        batch_initial_states = []
        batch_prob_sequences = [[] for example in batch]
        batch_sequences = [[] for example in batch]
        finished_seqs = [False for example in batch]
        batch_encoded_states = []

        
        for example in batch:
            encoded_inputs = self._encode_inputs(
                example.utterance,
                example.initial_state,
                example.history)
            batch_encoded_states.append(encoded_inputs)
            batch_initial_states.append(encoded_inputs[2])
            initial_state = None
            if self.args.feed_updated_state:
                if not fsa_builder:
                    raise ValueError("Need an FSA builder when feeding the "\
                                     + " updated state during sampling")
                initial_state = fsa_builder(example.initial_state)
            batch_states.append( \
                (initial_state,
                 self._init_decoder(),
                 (self.output_action_vocabulary.lookup_index(BEG),
                  self.output_location_vocabulary.lookup_index(BEG),
                  self.output_argument_vocabulary.lookup_index(BEG))))

        for _ in range(length):
            # Generate probabilities for this step.
            batch_probs = []

            batch_rnn_states = []

            assert len(batch) == len(batch_encoded_states)
            assert len(batch) == len(batch_states)

            for j, (example, encoded_states, state, initial_state) in \
                    enumerate(zip(batch, batch_encoded_states, batch_states, batch_initial_states)):

                if not finished_seqs[j]:
                    rnn_state, _ = self._update_rnn_state(encoded_states,
                                                          state[0],
                                                          state[1],
                                                          state[2],
                                                          initial_state,
                                                          training=training)
                    probs = self._get_probs(rnn_state.output())
                else:
                    probs = None
                    rnn_state = None
                batch_probs.append(probs)
                batch_rnn_states.append(rnn_state)

            # Do a forward pass on the entire batch.
            if [prob for prob in batch_probs if prob]:
                dy.esum([dy.concatenate(list(prob))
                         for prob in batch_probs if prob]).value()

                # Update the batch states and keep track of probability distribution
                # and generated sequences.
                new_states = []

                assert len(batch) == len(batch_states)
                assert len(batch) == len(batch_probs)
                assert len(batch) == len(batch_rnn_states)
                for j, (example, old_state, prob_dist, rnn_state) in enumerate(
                        zip(batch, batch_states, batch_probs, batch_rnn_states)):
                    if not finished_seqs[j]:
                        # Get the predicted token by sampling.
                        sampling_policy = prob_dist
                        if self.args.policy_shaping:
                            sampling_policy = self._policy_shape_probs(prob_dist)
                        predicted_token, token_prob = sample_any_tok(
                            sampling_policy, self._all_output_vocabulary)

                        # Update the FSA.
                        fsa = None
                        if self.args.feed_updated_state and predicted_token != (EOS, NO_ARG, NO_ARG):
                            fsa = old_state[0]
                            peek_state = fsa.peek_complete_action(*predicted_token)
                            if peek_state and predicted_token != (EOS, NO_ARG, NO_ARG):
                                fsa.feed_complete_action(*predicted_token)

                        # Only update batch states if you don't predict EOS. Otherwise,
                        # no point in continuing to generate for this example.
                        if predicted_token == (EOS, NO_ARG, NO_ARG):
                            finished_seqs[j] = True
                            new_states.append((None, None, None))
                        else:
                            predicted_token_idxs = \
                                (self.output_action_vocabulary.lookup_index(predicted_token[0]),
                                 self.output_location_vocabulary.lookup_index(predicted_token[1]),
                                 self.output_argument_vocabulary.lookup_index(predicted_token[2]))
                            new_states.append(
                                (fsa, rnn_state, predicted_token_idxs))

                        # Update probability expressions and samples.
                        batch_sequences[j].append(
                            (predicted_token, token_prob))
                        batch_prob_sequences[j].append(prob_dist)
                    else:
                        new_states.append((None, None, None))
                batch_states = new_states
            else:
                break

        return batch_prob_sequences, batch_sequences

    def generate_probs(self, utterance, state, history, fsa=None, fsa_restricted=False):
        """Gets predictions (by argmax) and their probabilities.


        Args:
            utterance (list of str): The current utterance.
            state (WorldState): The world state.
            history (list of list of str): The previous utterances.
            fsa (ExecutableFSA, optional): The FSA builder object, if using
                constrained decoding.

        Returns:
            list of (str, float), representing the predicted sequence, where
                each string is the predicted token and the float is the
                probability of the token.
        """
        dy.renew_cg()

        encoded_states = self._encode_inputs(utterance, state, history)
        initial_state = encoded_states[2]

        # Run the decoder.
        rnn_state = self._init_decoder()
        output_seq_probs = []
        attentions = []
        predicted_token_ints = [self.output_action_vocabulary.lookup_index(BEG),
                                self.output_location_vocabulary.lookup_index(BEG),
                                self.output_argument_vocabulary.lookup_index(BEG)]
        while len(output_seq_probs) <= LEN_LIMIT:
            # Compute the decoder input.
            rnn_state, attention = self._update_rnn_state(
                encoded_states,
                fsa,
                rnn_state,
                predicted_token_ints,
                initial_state if self.args.always_initial_state else None)
            attentions.append(attention)

            if self.args.fsa_restricted:
                raise ValueError("FSA generation is not implemented " \
                                 + "jointly predicting all three things")
            else:
                predicted_token, prob = self._predict(rnn_state.output(),
                                                      fsa_restricted,
                                                      fsa)

            output_seq_probs.append((predicted_token, prob))
            predicted_token_ints = \
                [self.output_action_vocabulary.lookup_index(predicted_token[0]),
                 self.output_location_vocabulary.lookup_index(predicted_token[1]),
                 self.output_argument_vocabulary.lookup_index(predicted_token[2])]
            if predicted_token == (EOS, NO_ARG, NO_ARG):
                return output_seq_probs, attentions
            if self.args.feed_updated_state:
                peek_state = fsa.peek_complete_action(*predicted_token)
                if peek_state:
                    fsa.feed_complete_action(*predicted_token)
        return output_seq_probs, attentions

    def generate(self, utterance, state, history, fsa, fsa_restricted=False):
        """Generates a sequence of predicted tokens for an input.

        Args:
            utterance (list of str): The current utterance.
            state (WorldState): The world state.
            history (list of list of str): The previous utterances.
            fsa (ExecutableFSA): The FSA, for constrained decoding.

        Returns:
            list of str, representing the predicted sequence.

        Todo:
            * Don't use map.
        """
        preds_and_probs, attentions = self.generate_probs(utterance,
                                                          state,
                                                          history,
                                                          fsa,
                                                          fsa_restricted)

        # Get only the tokens and remove the EOS token at the end.
        preds = [p[0] for p in preds_and_probs]
        if list(preds[-1]) == [EOS, NO_ARG, NO_ARG]:
            preds = preds[:-1]
        return preds, attentions
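
The constructor of this model flattens its three output vocabularies into one
joint vocabulary by enumerating the cross product of actions, locations and
arguments (skipping BEG) and recording which flat indices pass valid_action_fn.
A small standalone sketch of that enumeration, with toy vocabularies and a toy
validity rule (none of these names are the real ones):

from itertools import product

actions = ['push', 'pop', 'EOS']
locations = ['1', '2', 'NO_ARG']
arguments = ['red', 'NO_ARG']

def toy_valid_action_fn(action, location, argument):
    # Toy rule: 'pop' and 'EOS' take no argument.
    return argument == 'NO_ARG' if action in ('pop', 'EOS') else True

all_triples = list(product(actions, locations, arguments))
valid_indices = [i for i, t in enumerate(all_triples) if toy_valid_action_fn(*t)]
# all_triples[i] is the (action, location, argument) scored at flat index i of the
# joint distribution; valid_indices is what the softmax can be restricted to.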
Exemplo n.º 42
0
def main(args):
    assert FLAGS.validation_data_loader, "--validation_data_loader is required"
    assert FLAGS.vocab_file, "--vocab_file is required"
    assert FLAGS.model_path, "--model_path is required"
    assert FLAGS.selection_method in ['sampling', 'argmax', 'beam_search'],\
        "--selection_method can only be one of 'sampling', 'argmax' and 'beam_search'."
    model_config = configuration.ModelConfig()

    print('Loading vocabulary file...')
    vocab = Vocabulary(FLAGS.vocab_file)
    vocab_size = vocab.get_vocabulary_size()

    # Assign parameters to model configuration.
    model_config.vocab_size = vocab_size

    # Build the TensorFlow graph.
    g = tf.Graph()
    with g.as_default():
        print('Building LSTM decoder model for inference...')

        if not FLAGS.repeated_feed_images:
            model = LSTMDecoder(model_config, mode="inference")
        else:
            model = LSTMDecoderRepeatedImageFeed(model_config,
                                                 mode="inference")
        model.build()

        print('Initializing variables...')
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        print('Loading saved model...')
        saver = tf.train.Saver()
        saver.restore(sess, FLAGS.model_path)

        print('Initializing data loader for validation set...')
        start = time.time()
        data_loader_val = DataLoader()
        data_loader_val.load(FLAGS.validation_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' %
              time_elapsed)

        print('Start inference...')
        initial_input_sequence = np.zeros(model_config.batch_size,
                                          dtype=np.int32)
        initial_input_sequence.fill(vocab.start_id)

        max_sentence_length = const_config.lstm_truncated_length + 1

        json_results = []
        for image_features, _, _, _, video_indices, video_segment_indices, valid_count in \
                data_loader_val.segmental_sampling_iter(batch_size=model_config.batch_size,
                                                        num_segments=model_config.num_segments):

            current_input = initial_input_sequence.copy()
            if not FLAGS.repeated_feed_images:
                current_state = sess.run(
                    fetches="lstm/initial_state:0",
                    feed_dict={"input_features:0": image_features})
            else:
                current_state = sess.run(fetches="lstm/initial_state:0",
                                         feed_dict={})

            generated_sentences =\
                np.zeros((model_config.batch_size, max_sentence_length), dtype=np.int32)
            generated_sentences[:, 0] = current_input
            completed_masks = np.zeros(model_config.batch_size, dtype=np.bool)

            for i in range(const_config.lstm_truncated_length):
                if not FLAGS.repeated_feed_images:
                    softmax_output, next_state = sess.run(
                        fetches=["softmax:0", "lstm/state:0"],
                        feed_dict={
                            "input_feed:0": current_input,
                            "lstm/state_feed:0": current_state
                        })
                else:
                    softmax_output, next_state = sess.run(
                        fetches=["softmax:0", "lstm/state:0"],
                        feed_dict={
                            "input_feed:0": current_input,
                            "lstm/state_feed:0": current_state,
                            "input_features:0": image_features
                        })

                if FLAGS.selection_method == 'sampling':
                    # Sample the next word according to the probability.
                    next_input = []
                    for probs in softmax_output:
                        next_input.append(np.random.choice(vocab_size,
                                                           p=probs))
                    next_input = np.array(next_input)
                elif FLAGS.selection_method == 'argmax':
                    next_input = np.argmax(softmax_output, axis=1)
                else:
                    # TODO: implement beam search
                    next_input = None
                generated_sentences[:, i + 1] = next_input

                # Update input and state.
                current_input = next_input
                current_state = next_state

                # Early stop if we have generated the <END> token for all sentences.
                for j, word_id in enumerate(next_input):
                    if word_id == vocab.end_id:
                        completed_masks[j] = True
                if sum(completed_masks) == model_config.batch_size:
                    break

            # Extract text sentences.
            sentences = []
            for word_id_array in generated_sentences:
                word_id_array = remove_start_end_word_ids(word_id_array, vocab)
                text = vocab.id_array_to_sentence(word_id_array)
                sentences.append(text)
            sentences = sentences[:valid_count]

            for sentence in sentences:
                print(sentence)

            for i in range(valid_count):
                video_idx = video_indices[i]
                segment_idx = video_segment_indices[i]
                video = data_loader_val.videos[video_idx]
                video_segment = video.video_segments[segment_idx]
                caption_trimmed = remove_start_end_word_ids(
                    video_segment.caption, vocab)
                gt_caption = vocab.id_array_to_sentence(caption_trimmed)
                video_segment_name = video.name + str(segment_idx)
                json_results.append({
                    'name': video_segment_name,
                    'video_caption': sentences[i],
                    'gt_caption': gt_caption
                })
        print('Finished Inference.')

        print('Dumping results...')
        fo = open(FLAGS.output_file, 'w')
        json.dump(json_results, fo, indent=4)
        fo.close()
        print('Done.')
                        type=int,
                        default=VAL_FREQ_DEFAULT,
                        help='Frequency of evaluation on validation set')
    parser.add_argument('--vocab_file',
                        type=str,
                        default=DEFAULT_VOCAB_FILE,
                        help='Default vocabulary file')
    parser.add_argument('--one_hot',
                        type=str,
                        default=ONE_HOT_DEFAULT,
                        help='apply one hot encoding')
    parser.add_argument('--check_freq',
                        type=int,
                        default=CHECKPOINT_FREQ_DEFAULT,
                        help='test and save results ')
    parser.add_argument('--name',
                        type=str,
                        default=MODEL_NAME_DEFAULT,
                        help='model name')
    parser.add_argument('--append',
                        type=bool,
                        default=APPEND_DEFAULT,
                        help='append start,end token')
    FLAGS, unparsed = parser.parse_known_args()

    vocabulary = Vocabulary(FLAGS.vocab_file, None, None, flag='load')
    VOCAB_SIZE = len(vocabulary._vocab)
    start_v = vocabulary.word_to_id("#START#")
    end_v = vocabulary.word_to_id("#END#")
    print(vocabulary.word_to_id('dressing'))
    main(None)
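
The word-selection step in main() above either samples each next word id from its
softmax row or takes the row's argmax. A self-contained sketch of the two
strategies on a dummy batch (the array values are made up):

import numpy as np

vocab_size = 5
softmax_output = np.array([[0.1, 0.6, 0.1, 0.1, 0.1],
                           [0.2, 0.2, 0.2, 0.2, 0.2]])

# 'sampling': draw one word id per row according to its probabilities.
sampled = np.array([np.random.choice(vocab_size, p=row) for row in softmax_output])

# 'argmax': greedy decoding, always the most probable word id.
greedy = np.argmax(softmax_output, axis=1)  # -> array([1, 0])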
Exemplo n.º 44
0
import atislexicon
from augmentation import Augmenter
import domains
from encoderdecoder import EncoderDecoderModel
from attention import AttentionModel
from example import Example
import spec as specutil
from vocabulary import Vocabulary

MODELS = collections.OrderedDict([
    ('encoderdecoder', EncoderDecoderModel),
    ('attention', AttentionModel),
])

VOCAB_TYPES = collections.OrderedDict([
    ('raw', lambda s, e, **kwargs: Vocabulary.from_sentences(s, e, **kwargs)),
    ('glove', lambda s, e, **kwargs: Vocabulary.from_sentences(
        s, e, use_glove=True, **kwargs))
])

# Global options
OPTIONS = None

# Global statistics
STATS = {}


def _parse_args():
    global OPTIONS
    parser = argparse.ArgumentParser(
        description='A neural semantic parser.',
Exemplo n.º 45
0

from vocabulary import *
import pipeline_lstm
import pipeline_cnn

# pipeline_lstm.train()
# pipeline_lstm.test()
# pipeline_cnn.test()

import data_video
from vocabulary import Vocabulary

vocab_path = data_video.msvd_bilingual_vocab_char_path
vocab = Vocabulary.load(vocab_path)
dataset = data_video.MSVDDatasetBilingual(vocab=vocab, segment_method='char', caption_mode='text', split='train')

dataset.data.sort(key=lambda x: x.video_id)

with open('all_captions.txt', 'w') as f:
    for d in dataset.data:
        f.write('{:>12} {}\n'.format(d.video_id, d.caption))
Exemplo n.º 46
0
import general_utils
import chat_command_handler
from chat_settings import ChatSettings
from chatbot_model import ChatbotModel
from vocabulary import Vocabulary

#Read the hyperparameters and configure paths
_, model_dir, hparams, checkpoint = general_utils.initialize_session("chat")

#Load the vocabulary
print()
print("Loading vocabulary...")
if hparams.model_hparams.share_embedding:
    shared_vocab_filepath = path.join(model_dir,
                                      Vocabulary.SHARED_VOCAB_FILENAME)
    input_vocabulary = Vocabulary.load(shared_vocab_filepath)
    output_vocabulary = input_vocabulary
else:
    input_vocab_filepath = path.join(model_dir,
                                     Vocabulary.INPUT_VOCAB_FILENAME)
    input_vocabulary = Vocabulary.load(input_vocab_filepath)
    output_vocab_filepath = path.join(model_dir,
                                      Vocabulary.OUTPUT_VOCAB_FILENAME)
    output_vocabulary = Vocabulary.load(output_vocab_filepath)

#Create the model
print("Initializing model...")
print()
with ChatbotModel(mode="infer",
                  model_hparams=hparams.model_hparams,
                  input_vocabulary=input_vocabulary,
Exemplo n.º 47
0
def read_instances_from_file(files, max_len=400, keep_case=False):
    ''' Collect instances and construct vocab '''

    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []

    for file in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(file) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)

        assert len(sents) == len(labels)

        sets.append({'sents': sents, 'labels': labels})

        logger.info('Get {} instances from file {}'.format(len(sents), file))
        if trimmed_sent:
            logger.info(
                '{} sentences are trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Class: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))

    return sets, vocab, lb_vocab
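
read_instances_from_file expects one "label<TAB>sentence" pair per line. A minimal
usage sketch under that assumption (the file name and contents are made up, and it
relies on the same Vocabulary and logger as the snippet above):

with open('toy_train.tsv', 'w') as f:
    f.write('pos\tA great movie\n')
    f.write('neg\tNot worth watching\n')

sets, vocab, lb_vocab = read_instances_from_file(['toy_train.tsv'], max_len=50)
print(sets[0]['labels'])        # ['pos', 'neg']
print(len(vocab), len(lb_vocab))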
Exemplo n.º 48
0
def build_vocabs():
    tasks = [
        '.'.join([id, syn]) for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']

    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE'
    ) or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task +
        '.json' for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files], [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    lexcat_vocab = Vocabulary('LEXCAT')
    lexcat_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, govobj_config_vocab, lexcat_vocab
    ]
Exemplo n.º 49
0
def main():
    vocabulary = Vocabulary()
    hangman = Hangman(vocabulary)
    hangman.startGame()
Exemplo n.º 50
0
dev_e_path = '../data/validation/dev.e.gz'
dev_f_path = '../data/validation/dev.f.gz'
dev_wa = '../data/validation/dev.wa.nonullalign'

test_e_path = '../data/test/test.e.gz'
test_f_path = '../data/test/test.f.gz'
test_wa = '../data/test/test.wa.nonullalign'

# Using only 1000 words will result in many UNKs, but
# it will make training a lot faster.
# If you have a fast computer, a GPU, or a lot of time,
# try with 10000 instead.
max_tokens = 1000

corpus_e = smart_reader(train_e_path)
vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens)
pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb"))
print("English vocabulary size: {}".format(len(vocabulary_e)))

corpus_f = smart_reader(train_f_path)
vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens)
pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb"))
print("French vocabulary size: {}".format(len(vocabulary_f)))

# load test corpus
test_corpus = list(
    bitext_reader(smart_reader(test_e_path), smart_reader(test_f_path)))

# run
tf.reset_default_graph()
Exemplo n.º 51
0
from vocabulary import Vocabulary

NERS = Vocabulary('NERS', [
    'DATE', 'ORGANIZATION', 'O', 'ORDINAL', 'TIME', 'NUMBER', 'MONEY',
    'PERCENT', 'MISC', 'PERSON', 'LOCATION', 'DURATION', 'SET', None
])
Exemplo n.º 52
0
    return options, pattern


if __name__ == '__main__':
    import os
    import glob
    import pprint
    from vocabulary import Vocabulary
    import parallelize

    options, pattern = parse_args()

    olddir = os.getcwd()
    os.chdir(options.datadir)

    fnames = glob.glob(pattern)

    nprocesses = len(fnames) if options.parallel else None
    results = parallelize.run(process_file, fnames, nprocesses, options)

    full_counter = Counter()
    for counter in results:
        full_counter.update(counter)

    vocabulary = Vocabulary(full_counter, n_most_common=options.nwords)
    vocabulary.save('index')

    pprint.pprint(full_counter.most_common(200))
    print(len(full_counter))
    print(vocabulary)
    os.chdir(olddir)
Exemplo n.º 53
0
    def import_vocabulary(self,
                          vocabulary_dir,
                          normalize=True,
                          import_mode=VocabularyImportMode.External,
                          dataset_vocab=None):

        if dataset_vocab is None and import_mode != VocabularyImportMode.External:
            raise ValueError(
                "dataset_vocab must be provided if import_mode is not 'External'."
            )

        import_stats = VocabularyImportStats()

        #Read the external vocabulary tokens and embeddings
        tokens_with_embeddings = self._read_vocabulary_and_embeddings(
            vocabulary_dir)

        #If normalize flag is true, normalize casing of the external vocabulary and average embeddings for any resulting duplicate tokens
        if normalize:
            tokens_with_embeddings = self._normalize_tokens_with_embeddings(
                tokens_with_embeddings)

        import_stats.external_vocabulary_size = len(tokens_with_embeddings)

        #Apply dataset filters if applicable
        if dataset_vocab is not None:
            import_stats.dataset_vocabulary_size = dataset_vocab.size()

            if import_mode == VocabularyImportMode.ExternalIntersectDataset or import_mode == VocabularyImportMode.Dataset:
                #Get rid of all tokens that exist in the external vocabulary but don't exist in the dataset
                for token in list(tokens_with_embeddings.keys()):
                    if not dataset_vocab.word_exists(token):
                        del tokens_with_embeddings[token]
                import_stats.intersection_size = len(tokens_with_embeddings)

            if import_mode == VocabularyImportMode.ExternalUnionDataset or import_mode == VocabularyImportMode.Dataset:
                #Add any tokens that exist in the dataset but don't exist in the external vocabulary.
                #These added tokens will get word vectors sampled from the gaussian distributions of their components:
                #   where the mean of each component is the mean of that component in the external embedding matrix
                #   and the standard deviation of each component is the standard deviation of that component in the external embedding matrix
                embeddings_matrix = np.array(list(
                    tokens_with_embeddings.values()),
                                             dtype=np.float32)
                emb_size = embeddings_matrix.shape[1]
                emb_mean = np.mean(embeddings_matrix, axis=0)
                emb_stdev = np.std(embeddings_matrix, axis=0)
                for i in range(dataset_vocab.size()):
                    dataset_token = dataset_vocab.int2word(i,
                                                           capitalize_i=False)
                    if dataset_token not in tokens_with_embeddings:
                        tokens_with_embeddings[
                            dataset_token] = np.random.normal(
                                emb_mean, emb_stdev, emb_size)

        if len(tokens_with_embeddings) == 0:
            raise ValueError(
                "Imported vocabulary size is 0. Try a different VocabularyImportMode (currently {0})"
                .format(VocabularyImportMode(import_mode).name))

        tokens, embeddings_matrix = zip(*tokens_with_embeddings.items())
        embeddings_matrix = np.array(embeddings_matrix, dtype=np.float32)

        #Create the vocabulary instance
        vocabulary = Vocabulary(external_embeddings=embeddings_matrix)
        for i in range(len(tokens)):
            vocabulary.load_word(tokens[i], i)
        vocabulary.compile(loading=True)
        return vocabulary, import_stats
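
The union branch above backfills tokens that exist in the dataset but not in the
external vocabulary with vectors drawn component-wise from a normal distribution
fitted to the external embedding matrix. A compact numpy sketch of just that
sampling step (the matrix here is a random toy stand-in):

import numpy as np

external_embeddings = np.random.rand(1000, 300).astype(np.float32)  # toy matrix
emb_mean = np.mean(external_embeddings, axis=0)    # per-component mean
emb_stdev = np.std(external_embeddings, axis=0)    # per-component std deviation

# Embedding for a token present in the dataset but missing from the external vocab.
oov_vector = np.random.normal(emb_mean, emb_stdev, external_embeddings.shape[1])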
Exemplo n.º 54
0
class Model(object):
    def __init__(self):
        self.vocab = Vocabulary()
        self.language_module = dataset.LSTMLanguageModule(
            message_flags.flattened_message_size(),
            self.vocab.get_vocab_size()).to(device)
        self.training_examples = []

        self.encoder = pretrain.load_saved_encoder().to(device)
        self.encoder.eval()
        self.decoder = pretrain.load_saved_decoder().to(device)
        self.decoder.eval()

        params_to_train = list(self.language_module.parameters())
        if FLAGS.model_train_decoder:
            params_to_train.extend(list(self.decoder.parameters()))
        self.optimizer = optim.Adam(params_to_train, weight_decay=1e-5)

    def predict(self, state, command):
        self.language_module.eval()
        self.decoder.eval()
        token_ids = self.vocab.token_ids(command)
        command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        state_variable = dataset.state_to_variable(state).to(device)
        encoder_output = self.language_module.forward(command_variable)
        decoder_input = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation(
            encoder_output)
        prediction = self.decoder.forward(state_variable, decoder_input)
        return dataset.output_from_variable(prediction, state)

    def optimizer_step(self):
        self.language_module.train()
        self.decoder.train()
        random.shuffle(self.training_examples)
        for batch in util.batch_iterator(self.training_examples,
                                         FLAGS.model_batch_size):
            states = [s for s, c, t, m in batch]
            commands = [c for s, c, t, m in batch]
            targets = [t for s, c, t, m in batch]
            target_messages = [m for s, c, t, m in batch]
            self.optimizer.zero_grad()
            target_message = torch.from_numpy(
                np.concatenate(target_messages, 0)).to(device)
            state_variable = dataset.state_to_variable_batch(states).to(device)
            target_variable = dataset.output_to_variable_batch(
                targets, states).to(device)
            max_command_len = max(len(c) for c in commands)
            token_ids = np.zeros((len(commands), max_command_len),
                                 dtype=np.int64)
            for i, command in enumerate(commands):
                ids = self.vocab.token_ids(command)
                token_ids[i, -len(ids):] = ids
            command_variable = torch.from_numpy(token_ids).to(device)

            encoder_output = self.language_module.forward(command_variable)
            decoder_input = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation(
                encoder_output)

            prediction = self.decoder.forward(state_variable, decoder_input,
                                              target_variable)

            if FLAGS.continuous_message:
                error = encoder_output - target_message
                message_loss = (error * error).sum()
            else:
                log_message_probs = F.log_softmax(
                    encoder_output.view(-1, FLAGS.discrete_message_size,
                                        FLAGS.discrete_message_symbols), 2)
                target_message_reshaped = target_message.view(
                    -1, FLAGS.discrete_message_size,
                    FLAGS.discrete_message_symbols)
                message_loss = -(log_message_probs *
                                 target_message_reshaped).sum()

            loss = dataset.loss(
                prediction, target_variable
            ) + FLAGS.model_message_loss_weight * message_loss
            loss = loss / len(batch)  # avg instead of sum

            loss.backward()
            self.optimizer.step()

    def training_accuracy(self):
        n_correct = 0
        for state, command, target, target_message in self.training_examples:
            prediction = self.predict(state, command)
            if prediction == target:
                n_correct += 1
        return n_correct / len(self.training_examples)

    def update(self, state, command, target_output, num_updates=None):
        if num_updates is None:
            num_updates = FLAGS.model_max_updates
        state_variable = dataset.state_to_variable(state).to(device)
        target_variable = dataset.output_to_variable(target_output,
                                                     state).to(device)
        encoder_output = self.encoder.forward(state_variable, target_variable)
        target_message = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation(
            encoder_output)
        target_message = target_message.cpu().detach().numpy()

        self.training_examples.append(
            (state, command, target_output, target_message))
        for _ in range(num_updates):
            self.optimizer_step()
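
optimizer_step batches variable-length commands by left-padding their token ids
into one fixed-width matrix (token_ids[i, -len(ids):] = ids). A tiny numpy
illustration of that layout (the ids are made up):

import numpy as np

commands_ids = [[7, 3], [5, 9, 2, 4]]            # variable-length token ids
max_command_len = max(len(ids) for ids in commands_ids)

token_ids = np.zeros((len(commands_ids), max_command_len), dtype=np.int64)
for i, ids in enumerate(commands_ids):
    token_ids[i, -len(ids):] = ids               # right-aligned, zero-padded on the left
# token_ids -> [[0 0 7 3]
#               [5 9 2 4]]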
Exemplo n.º 55
0
    if title is not None:
        ax.set_title(title)
    ax.imshow(image)

    return ax


vocab_threshold = 5
vocab_file = './vocab.pkl'
start_word = "<start>"
end_word = "<end>"
unk_word = "<unk>"
annotations_file = os.path.join(
    '/home/george/', 'cocoapi/annotations/image_info_test2014.json')

vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word, unk_word,
                   annotations_file, True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder_file = 'encoder-2.pkl'
decoder_file = 'decoder-2.pkl'

embed_size = 256
hidden_size = 512

vocab_size = len(vocab)

encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()
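The snippet above builds the encoder and decoder in eval mode but stops before restoring the saved weights. A sketch of that step, assuming encoder_file and decoder_file are state_dict pickles saved under a ./models directory (the directory name is an assumption):

import os
import torch

# Assumed layout: ./models/encoder-2.pkl and ./models/decoder-2.pkl hold state_dicts.
encoder.load_state_dict(
    torch.load(os.path.join('./models', encoder_file), map_location=device))
decoder.load_state_dict(
    torch.load(os.path.join('./models', decoder_file), map_location=device))

encoder.to(device)
decoder.to(device)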
Exemplo n.º 56
0
from vocabulary import Vocabulary

UD_DEPS = Vocabulary('UD_DEPS', [
    'ROOT', 'mark', 'obj', 'amod', 'dep', 'cop', 'appos', 'advmod', 'conj',
    'cc', 'nsubjpass', 'compound', 'aux:pass', 'iobj', 'nsubj', 'root',
    'nmod:tmod', 'ccomp', 'aux', 'cc:preconj', 'nsubj:pass', 'nmod', 'neg',
    'acl', 'fixed', 'dobj', 'xcomp', 'auxpass', 'reparandum', 'det',
    'discourse', 'vocative', 'flat', 'csubj:pass', 'obl', 'obl:tmod', 'punct',
    'compound:prt', 'csubjpass', 'nummod', 'mwe', 'csubj', 'list', 'nmod:poss',
    'advcl', 'obl:npmod', 'dislocated', 'orphan', 'expl', 'acl:relcl',
    'nmod:npmod', 'goeswith', 'det:predet', 'case', 'parataxis', None
])
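The Vocabulary constructor is only shown here taking a name and a fixed list of UD dependency labels. A hypothetical minimal stand-in with the label/index mapping such a closed label set typically needs (this is not the project's actual Vocabulary class):

class LabelVocabulary:
    """Hypothetical fixed label vocabulary; the real Vocabulary API may differ."""

    def __init__(self, name, labels):
        self.name = name
        self.itos = list(labels)                          # index -> label
        self.stoi = {l: i for i, l in enumerate(labels)}  # label -> index

    def __len__(self):
        return len(self.itos)

    def index(self, label):
        return self.stoi[label]

    def label(self, index):
        return self.itos[index]


deps = LabelVocabulary('UD_DEPS', ['ROOT', 'nsubj', 'obj', None])
assert deps.index('obj') == 2 and deps.label(3) is None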
Exemplo n.º 57
0
    def chat(self, question, chat_settings):

        if chat_settings.enable_auto_punctuation:
            question = Vocabulary.auto_punctuate(question)
        question = Vocabulary.clean_text(
            question,
            normalize_words=chat_settings.inference_hparams.normalize_words)
        question = self.input_vocabulary.words2ints(question)

        question_with_history = []
        for i in range(len(self.conversation_history)):
            question_with_history += self.conversation_history[i] + [
                self.input_vocabulary.eos_int()
            ]
        question_with_history += question

        #Get the answer prediction
        batch = np.zeros((1, len(question_with_history)))
        batch[0] = question_with_history
        max_output_sequence_length = chat_settings.inference_hparams.max_answer_words + 1  # + 1 since the EOS token is counted as a timestep
        predicted_answer_info = self.predict_batch(
            inputs=batch,
            input_sequence_length=np.array([len(question_with_history)]),
            max_output_sequence_length=max_output_sequence_length,
            beam_length_penalty_weight=chat_settings.inference_hparams.
            beam_length_penalty_weight,
            sampling_temperature=chat_settings.inference_hparams.
            sampling_temperature,
            log_summary=chat_settings.inference_hparams.log_summary)

        #Read the answer prediction
        answer_beams = []
        if self.beam_width > 0:
            #For beam search decoding: if show_all_beams is enabled then output all beams (sequences), otherwise take the first beam.
            #   The beams (in the "predictions" matrix) are ordered with the highest ranked beams first.
            beam_count = 1 if not chat_settings.show_all_beams else len(
                predicted_answer_info["predictions_seq_lengths"][0])
            for i in range(beam_count):
                predicted_answer_seq_length = predicted_answer_info[
                    "predictions_seq_lengths"][0][
                        i] - 1  #-1 to exclude the EOS token
                predicted_answer = predicted_answer_info["predictions"][
                    0][:predicted_answer_seq_length, i].tolist()
                answer_beams.append(predicted_answer)
        else:
            #For greedy / sampling decoding: only one beam (sequence) is returned, based on the argmax for greedy decoding
            #   or the sampling distribution for sampling decoding. Return this beam.
            beam_count = 1
            predicted_answer_seq_length = predicted_answer_info[
                "predictions_seq_lengths"][0] - 1  #-1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][
                0][:predicted_answer_seq_length].tolist()
            answer_beams.append(predicted_answer)

        #Add new conversation steps to the end of the history and trim from the beginning if it is longer than conv_history_length
        #Answers need to be converted from output_vocabulary ints to input_vocabulary ints (since they will be fed back in to the encoder)
        self.conversation_history.append(question)
        answer_for_history = self.output_vocabulary.ints2words(
            answer_beams[0], is_punct_discrete_word=True, capitalize_i=False)
        answer_for_history = self.input_vocabulary.words2ints(
            answer_for_history)
        self.conversation_history.append(answer_for_history)
        self.trim_conversation_history(
            chat_settings.inference_hparams.conv_history_length)

        #Convert the answer(s) to text and return
        answers = []
        for i in range(beam_count):
            answer = self.output_vocabulary.ints2words(answer_beams[i])
            answers.append(answer)

        q_with_hist = None if not chat_settings.show_question_context else self.input_vocabulary.ints2words(
            question_with_history)
        if chat_settings.show_all_beams:
            return q_with_hist, answers
        else:
            return q_with_hist, answers[0]
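A hedged usage sketch for chat(): it returns (question_with_history, answer) when show_all_beams is off and (question_with_history, answers) when it is on, so a simple console loop might look like the following (the chatbot and chat_settings objects are assumed to be constructed elsewhere):

# Assumes a constructed chatbot model and a chat_settings object with the
# attributes read inside chat().
while True:
    question = input("You: ")
    if question.strip().lower() in ("exit", "quit"):
        break
    context, answer = chatbot.chat(question, chat_settings)
    if chat_settings.show_all_beams:
        for i, beam in enumerate(answer):  # answer is a list of beams
            print("Bot (beam {}): {}".format(i, beam))
    else:
        print("Bot: {}".format(answer))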
Exemplo n.º 58
0
class Model(object):
    def __init__(self):
        self.vocab = Vocabulary()
        self.language_module = dataset.LSTMLanguageModule(
            message_flags.flattened_message_size(),
            self.vocab.get_vocab_size()).to(device)
        self.decoder = dataset.Decoder(
            message_flags.flattened_message_size()).to(device)
        all_params = list(self.language_module.parameters()) + list(
            self.decoder.parameters())
        self.optimizer = optim.Adam(all_params, weight_decay=1e-5)
        self.training_examples = []

    def predict(self, state, command):
        self.language_module.eval()
        self.decoder.eval()
        token_ids = self.vocab.token_ids(command)
        command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        state_variable = dataset.state_to_variable(state).to(device)
        encoder_output = self.language_module.forward(command_variable)
        decoder_input = (encoder_output if FLAGS.continuous_message else
                         discrete_util.discrete_transformation(encoder_output))
        prediction = self.decoder.forward(state_variable, decoder_input)
        return dataset.output_from_variable(prediction, state)

    def optimizer_step(self):
        self.language_module.train()
        self.decoder.train()
        random.shuffle(self.training_examples)
        for state, command, target in self.training_examples:
            self.optimizer.zero_grad()
            state_variable = dataset.state_to_variable(state).to(device)
            target_variable = dataset.output_to_variable(target,
                                                         state).to(device)
            token_ids = self.vocab.token_ids(command)
            command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(
                device)

            encoder_output = self.language_module.forward(command_variable)
            decoder_input = (encoder_output if FLAGS.continuous_message else
                             discrete_util.discrete_transformation(encoder_output))
            prediction = self.decoder.forward(state_variable, decoder_input,
                                              target_variable)

            loss = dataset.loss(prediction, target_variable)

            loss.backward()
            self.optimizer.step()

    def training_accuracy(self):
        n_correct = 0
        for state, command, target in self.training_examples:
            prediction = self.predict(state, command)
            if prediction == target:
                n_correct += 1
        return n_correct / len(self.training_examples)

    def update(self, state, command, target_output, num_updates=None):
        if num_updates is None:
            num_updates = FLAGS.baseline_max_updates
        self.training_examples.append((state, command, target_output))
        for _ in range(num_updates):
            self.optimizer_step()
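This baseline Model is trained online: each update() call appends one example and then replays the whole buffer. A minimal usage sketch under that assumption (examples is a placeholder for whatever iterable of (state, command, target) triples the surrounding code provides):

model = Model()
for state, command, target in examples:  # examples: assumed (state, command, target) triples
    model.update(state, command, target, num_updates=1)

print("training accuracy:", model.training_accuracy())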
Exemplo n.º 59
0
	def rewrite(self):
		""" Rewrite the flight according to the vocabulary voc (voc is a Vocabulary)"""
		rw=[]
		for part in self.vocabulary.getPartitions():
			for partelt in part.getModalities():
				val=self.getValue(part.getAttName())
				mu = partelt.getMu(val)
				rw.append(mu)
		return rw

	def satisfaisant(self, conditions):

		for condition in conditions:
			part = self.vocabulary.getPartition(condition[0])
			partelt = part.getModality(condition[1])
			val = self.getValue(part.getAttName())
			mu = partelt.getMu(val)
			if (mu < condition[2]):
				return False
		return True

if __name__ == "__main__":
	if len(sys.argv)  < 2:
		print("Usage: python flight.py <vocfile.csv>")
	else:
		if os.path.isfile(sys.argv[1]):
			voc = Vocabulary(sys.argv[1])
			line= "2008,1,3,4,1103,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA"
			f = Flight(line,voc)
			print(f.rewrite())
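getMu is used above as a membership function: rewrite() collects, for each modality of each partition, the degree of the flight's attribute value, and satisfaisant() compares those degrees against per-condition thresholds. A sketch of the kind of trapezoidal membership function a modality might implement (an assumption; the actual partition/modality classes are not shown here):

def trapezoidal_mu(val, a, b, c, d):
    """Hypothetical membership degree of val for a trapezoid that rises on
    [a, b], is flat on [b, c] and falls on [c, d]."""
    if b <= val <= c:
        return 1.0
    if val <= a or val >= d:
        return 0.0
    if a < val < b:
        return (val - a) / (b - a)
    return (d - val) / (d - c)


# e.g. a "short delay" modality over departure delay in minutes:
assert trapezoidal_mu(5, 0, 0, 10, 20) == 1.0
assert trapezoidal_mu(15, 0, 0, 10, 20) == 0.5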
Exemplo n.º 60
0
if __name__ == '__main__':
    arguments = parse_args()

    logger.info('Loading config')
    with open(arguments.config) as config_file:
        config = yaml.safe_load(config_file)

    logger.info('Initializing input stream')
    input_stream = LineSentence(
        arguments.corpus,
        max_sentence_length=config['sliding_window']['change_every_words'])

    min_word_freq = config['vocabulary']['min_freq']
    logger.info('Building vocabulary with min_freq={}'.format(min_word_freq))
    vocab = Vocabulary.from_documents(input_stream, min_word_freq)

    vocabulary_size = len(vocab)
    logger.info('Vocabulary size: {}'.format(vocabulary_size))

    logger.info('Building negative sampling distribution')
    negative_sampler = HierarchicalSampler(
        vocab=vocab,
        alpha=config['negative_sampling']['alpha'],
        chunks_num=config['negative_sampling']['vocab_chunks_num'])

    logger.info('Building model computation graph')
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=config['training_params']['initial_learning_rate'])

    negative_samples_num = config['sliding_window']['max_size'] * \