Example #1
def test_models_have_correct_lambda_size():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(0, lm.n - 2):
        model = lm.models[i]
        assert len(model.lambdas) == len(model.hist_words_dct)
Example #2
def test_models_have_correct_n():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    for i in range(0, lm.n - 2):
        model = lm.models[i]
        assert model.n == i + 2
Example #3
def test_perplexity_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    perp = round(lm.perplexity(2, math.log(0.5)), 5)
    correct = round(math.sqrt(2), 5)
    assert perp == correct
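    # Note: perplexity here is exp(-L/N): with N = 2 tokens and total log
    # probability L = log(0.5), exp(-log(0.5)/2) = sqrt(2).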
Example #4
class State:
    keywords = {}
    def __init__(self):
        logging.info("Instantiating State class: %s" % self.__class__.__name__)
        
        # Add keywords from superclasses
        self.keywords = State.fold_keywords(self.__class__, self.keywords)

        # If the State doesn't have a LanguageModel set, automatically create
        # a LanguageModel specific to the keywords of this State
        if not hasattr(self, 'lm'):
            logging.info("We need to create a LanguageModel for this State")
            commands_array = self.keywords.keys()
            self.lm = LanguageModel(self.__class__.__name__, commands_array)
            self.lm.update_all()
            logging.info("LanguageModel created")
    
    @staticmethod
    def fold_keywords(clazz, keywords):
        for base in clazz.__bases__:
            keywords.update(State.fold_keywords(base, base.keywords))
        return keywords

    def process(self, text):
        state_change = []
        if text in self.keywords:
            state_change = self.keywords[text]
            if type(state_change) not in [list, tuple]:
                state_change = [state_change]
        logging.info('Processed text = %s with result = %s' % (text, state_change))
        return state_change
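For context, a minimal sketch of how a subclass drives this class, assuming Python 2 semantics (the old-style State has empty __bases__, which terminates fold_keywords) and a hypothetical StubLM standing in for the real LanguageModel:

class StubLM(object):
    def update_all(self):
        pass

class Idle(State):
    lm = StubLM()               # pre-setting lm skips automatic LanguageModel creation
    keywords = {
        'PLAY': 'PlayingMedia'  # a bare value is wrapped into a one-element list
    }

idle = Idle()
assert idle.process('PLAY') == ['PlayingMedia']
assert idle.process('nonsense') == []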
Example #5
def test_models_have_correct_beginning_grams():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sorted(lm.models[0].beginning_grams) \
     == sorted(['this', 'shall', 'PAD'])
    assert sorted(lm.models[1].beginning_grams) \
     == sorted(['PAD this', 'this text', 'PAD PAD', 'shall train'])
Example #6
    def __init__(self):
        super().__init__()

        self.model_lm = LanguageModel()
        self.model_ct = ContentTransfer()
        self.kb = KnowledgeBase()
        self.ranker = Ranker(self.model_lm)
        self.local = True
Example #7
File: hmm1.py Project: TPLink32/nlp
    def __init__(self):
        self.lm = LanguageModel('RenMinData.txt')
        self.dict = {}
        self.words = []
        self.max_len_word = 0
        self.load_dict('dict.txt')
        self.graph = None
        self.viterbi_cache = {}
Example #8
def test_train_creates_expected_hist_words_dict():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    assert sorted(list(model.hist_words_dct.keys())) \
     == sorted(['PAD', 'this', 'text', 'shall', 'train', '.'])
    assert list(model.hist_words_dct['this'].keys()) == ['text']
    assert list(model.hist_words_dct['text'].keys()) == ['.']
    assert list(model.hist_words_dct['shall'].keys()) == ['train']
    assert list(model.hist_words_dct['train'].keys()) == ['text']
    assert list(model.hist_words_dct['PAD'].keys()) == ['this']
    assert sorted(list(model.hist_words_dct['.'].keys())) \
     == sorted(['PAD', 'shall'])
Example #9
def test_kn_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['text', 'shall', 'train']) == -2.0770634192748685
    assert lm.kn_evaluate(['this', 'text', 'dog']) == -3.1656313103493887
    assert lm.kn_evaluate(['the', 'brown', 'cat']) == -2.4724841297894433
Example #10
def test_laplace_produces_expected_values():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['this', 'shall', 'train', 'PAD']) \
     == -2.890371757896165
    assert lm.laplace_evaluate(['dog', 'text', '.', 'PAD']) \
     == (math.log(1 / 9) + math.log(1 / 2))
Example #11
    def __init__(self):
        logging.info("Instantiating State class: %s" % self.__class__.__name__)
        
        # Add keywords from superclasses
        self.keywords = State.fold_keywords(self.__class__, self.keywords)

        # If the State doesn't have a LanguageModel set, automatically create
        # a LanguageModel specific to the keywords of this State
        if not hasattr(self, 'lm'):
            logging.info("We need to create a LanguageModel for this State")
            commands_array = self.keywords.keys()
            self.lm = LanguageModel(self.__class__.__name__, commands_array)
            self.lm.update_all()
            logging.info("LanguageModel created")
Example #12
class DialogBackendLocal(DialogBackend):
    def __init__(self):
        super().__init__()

        self.model_lm = LanguageModel()
        self.model_ct = ContentTransfer()
        self.kb = KnowledgeBase()
        self.ranker = Ranker(self.model_lm)
        self.local = True

    def predict(self, context, max_n=1):
        print('backend running, context = %s' % context)
        query = self.get_query(context)

        # get results from different models
        results = self.model_lm.predict(context)

        passages = []
        url_snippet = []
        for line in open('args/kb_sites.txt', encoding='utf-8'):
            cust = line.strip('\n')
            kb_args = {'domain': 'cust', 'cust': cust, 'must_include': []}
            url_snippet.append(self.kb.predict(query, args=kb_args)[0])
            passage = ' ... '.join([snippet for _, snippet in url_snippet])
            passages.append((passage, query))

        for passage, kb_query in passages:
            results += self.model_ct.predict(kb_query, passage)

        # rank hyps from different models

        hyps = [hyp for _, _, hyp in results]
        scored = self.ranker.predict(context, hyps)
        ret = []
        for i, d in enumerate(scored):
            d['way'], _, d['hyp'] = results[i]
            ret.append((d['score'], d))
        ranked = [d for _, d in sorted(ret, key=lambda x: x[0], reverse=True)]
        if max_n > 0:
            ranked = ranked[:min(len(ranked), max_n)]
        return ranked, url_snippet
Example #13
def main():
    p = get_argparser()
    args = p.parse_args()

    lm = LanguageModel()
    lm.configure_logger(level=logging.DEBUG if args.DEBUG else logging.INFO,
                        write_file=True)

    if args.train and args.data_path:
        lm.train(args.data_path,
                 output_path=args.train,
                 learning_rate=args.learning_rate,
                 hidden_size=args.hidden_size,
                 batch_size=args.batch_size,
                 max_epoch=args.max_epoch)

    elif args.test and args.data_path:
        lm.predict(args.test, args.data_path)

    else:
        # Well, this is silly.
        p.print_help()
        exit(2)
Example #14
def test_subsequent_training():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    model = lm.models[-1]
    wh1_len = len(model.word_hists_dct)
    hw1_len = len(model.hist_words_dct)
    data = tokenize('This sample.')
    lm.train(data)
    model = lm.models[-1]
    wh2_len = len(model.word_hists_dct)
    hw2_len = len(model.hist_words_dct)
    assert wh2_len - wh1_len == 1
    assert hw2_len - hw1_len == 1
    assert sorted(list(model.word_hists_dct['.'].keys())) \
     == sorted(['text', 'sample'])
    assert sorted(list(model.hist_words_dct['this'].keys())) \
     == sorted(['text', 'sample'])
Example #15
# -*- coding: utf-8 -*-
from lm import LanguageModel
from memoize import Memoize

lm = LanguageModel()


def splits(text, max_len=10):
    return [(text[:i + 1], text[i + 1:])
            for i in range(min(len(text), max_len))]


@Memoize
def segment(text):
    text = text.strip()
    if not text:
        return []

    candidates = [[left] + segment(right) for left, right in splits(text)]
    return max(candidates, key=lm.get_words_prob)


if __name__ == '__main__':
    test = [
        'colorlessgreenideassleepfuriously.', 'ihaveadream.',
        'howtotrainadragon.', 'canwetakeaphotoofyou?'
    ]

    for text in test:
        words = segment(text)
        print(text)
        print(' '.join(words))
Example #16
    def test_ngram(self):
        result = LanguageModel(2).get_ngrams(["hello", "world", "lmao"])
        self.assertEqual(result, [(None, 'hello'), ('hello', 'world'),
                                  ('world', 'lmao'), ('lmao', None)])
Example #17
    def test_update_all(self):
        lm = LanguageModel('playing')
        lm.update_all(True)
        lm.reset_files()
Example #18
def test_lm_has_correct_number_tokens_and_unigram_types():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.num_tokens == 7
    assert len(lm.unigrams) == 5
Example #19
def test_discount():
    lm = LanguageModel(2)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.discount == 0.75
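The 0.75 here is the absolute-discount constant typical of Kneser-Ney smoothing, which these tests exercise via kn_evaluate. As a point of reference only (not this project's API), a minimal bigram sketch of interpolated Kneser-Ney:

from collections import Counter

def kn_bigram_prob(w, h, bigrams, d=0.75):
    """Interpolated Kneser-Ney P(w | h) with absolute discount d."""
    c = Counter(bigrams)
    h_total = sum(n for (hist, _), n in c.items() if hist == h)  # count of history h
    h_types = len({word for (hist, word) in c if hist == h})     # distinct continuations of h
    # continuation probability: fraction of distinct bigram types ending in w
    p_cont = len({hist for (hist, word) in c if word == w}) / len(c)
    lam = d * h_types / h_total                                  # interpolation weight for h
    return max(c[(h, w)] - d, 0) / h_total + lam * p_cont

bigrams = [('PAD', 'this'), ('this', 'text'), ('text', '.'), ('.', 'PAD')]
print(kn_bigram_prob('text', 'this', bigrams))  # 0.25/1 + 0.75 * 1/4 = 0.4375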
Example #20
    else:
        source = Reader(options.input)

    if options.output == '-':
        writer = sys.stdout
    else:
        writer = Writer(options.output)

    if debug:
        rules.DEBUG = 1

    config = Config(options.config)
    if logger.level <= logging.INFO:
        config.write(sys.stderr)

    lm = LanguageModel(config.lm_file, config.lm_order)
    rule_table = RuleTable.load(config.rule_table_file, lm, config)

    extra_feature_funcs = build_extra_feature_funcs(config)
    recombination_checker = CombinedRecombinationChecker(extra_feature_funcs)
    decoder = CKYDecoder(config,
                         rule_table,
                         lm,
                         recombination_checker=recombination_checker,
                         extra_feature_funcs=extra_feature_funcs,
                         checking_hypo=checking,
                         expend_loser=expend_loser)

    logger.info('Start decoding...')

    def translate(data):
Example #21
    parser.add_argument('--vocab_len', type=int, default=19800, dest='vocab_len')
    parser.add_argument('--lr', type=float, default=1e-3, dest='lr')
    parser.add_argument('--minibatch_size', type=int, default=64, dest='minibatch_size')
    parser.add_argument('--num_epochs', type=int, default=30, dest='num_epochs')
    parser.add_argument('--models_folder', default='../lm_models', dest='folder')
    parser.add_argument('--graph_folder', default='../lm_graph', dest='graphs')
    args = parser.parse_args()
    
    # Fit the model

    if args.mode == 'train':
        # Read the initial word vectors
        train_data = np.load('lm_train_data.npy')
        train_labels = np.load('lm_train_labels.npy')
        
        lm = LanguageModel(args.lr, args.num_steps, args.vocab_len, args.minibatch_size)
        init = tf.global_variables_initializer()

        with tf.Session() as sess:
            sess.run(init)
            lm.fit(sess, train_data, train_labels, num_epochs=args.num_epochs, folder=args.folder, graph_folder=args.graphs)
    else:
        tweets = dill.load(open("tweets", "rb"))
        w2i = dill.load(open("w2i","rb"))
        i2w = dill.load(open("i2w","rb"))
        word_vector = dill.load(open("word_vecs","rb"))

        start_wd = ["president", "@netanyahu", "democrats", "gop", "congress", "white", "my", "the", "#makeamericagreatagain", "republicans", "wall", "@realdonaldtrump", "crooked"]
        input_list = [np.array([[word_vector[w2i[item]]]]) for item in start_wd]

        model = LanguageModel(args.lr, args.num_steps, args.vocab_len, args.minibatch_size)
Example #22
File: hmm1.py Project: TPLink32/nlp
class DPSplit(object):
    """Dynamic-programming word segmentation."""
    def __init__(self):
        self.lm = LanguageModel('RenMinData.txt')
        self.dict = {}
        self.words = []
        self.max_len_word = 0
        self.load_dict('dict.txt')
        self.graph = None
        self.viterbi_cache = {}

    def get_key(self, t, k):
        return '_'.join([str(t), str(k)])

    def load_dict(self, file):
        with open(file, 'r') as f:
            for line in f:
                word_list = [
                    w.encode('utf-8')
                    for w in list(line.strip().decode('utf-8'))
                ]
                if len(word_list) > 0:
                    self.dict[''.join(word_list)] = 1
                    if len(word_list) > self.max_len_word:
                        self.max_len_word = len(word_list)

    def createGraph(self):
        """Build a directed graph from the input sentence."""
        self.graph = Graph()
        for i in range(len(self.words)):
            self.graph.sequence.append({})
        word_length = len(self.words)
        # Build the set of candidate words for each character position
        for i in range(word_length):
            for j in range(self.max_len_word):
                if i + j + 1 > len(self.words):
                    break
                word = ''.join(self.words[i:i + j + 1])
                if word in self.dict:
                    node = Node(word)
                    # Place the word at the position of its final character
                    self.graph.sequence[i + j][word] = node
        # Append an empty end node to simplify the computation
        end = Node('#')
        self.graph.sequence.append({'#': end})
        # for s in self.graph.sequence:
        #   for i in s.values():
        #     print i.word,
        #   print ' - '
        # exit(-1)

    def split(self, sentence):
        self.words = [
            w.encode('utf-8') for w in list(sentence.decode('utf-8'))
        ]
        self.createGraph()
        # Compute the maximum score of every node with the Viterbi dynamic-programming algorithm
        self.viterbi(len(self.words), '#')
        # Output the nodes on the highest-scoring path
        end = self.graph.sequence[-1]['#']
        node = end.prev_node
        result = []
        while node:
            result.insert(0, node.word)
            node = node.prev_node
        print(''.join(self.words))
        print(' '.join(result))

    def viterbi(self, t, k):
        """第t个位置,是单词k的最优路径概率"""
        if self.get_key(t, k) in self.viterbi_cache:
            return self.viterbi_cache[self.get_key(t, k)]
        node = self.graph.sequence[t][k]
        # Case t == 0, i.e. the first character of the sentence
        if t == 0:
            node.max_score = self.lm.get_init_prop(k)
            self.viterbi_cache[self.get_key(t, k)] = node.max_score
            return node.max_score
        prev_t = t - len(k.decode('utf-8'))
        # If the previous node would fall before the start of the sentence, no probability needs to be computed
        if prev_t == -1:
            return 1.0
        # Collect all candidate words at the previous position
        pre_words = self.graph.sequence[prev_t].keys()
        for l in pre_words:
            # Transition probability from l to k
            state_transfer = self.lm.get_trans_prop(k, l)
            # Score of the current state = probability of the best previous path times the transition probability
            score = self.viterbi(prev_t, l) * state_transfer
            prev_node = self.graph.sequence[prev_t][l]
            cur_score = score + prev_node.max_score
            if cur_score > node.max_score:
                node.max_score = cur_score
                # Save the best predecessor of this node for backtracking the output
                node.prev_node = self.graph.sequence[prev_t][l]
        self.viterbi_cache[self.get_key(t, k)] = node.max_score
        return node.max_score
Example #23
                waited += 1
                if waited >= patience:
                    break
            era_index += 1
            era_loss = 0.
            era_samples = 0

    torch.save(checkpoint, os.path.join(save_dir, f"{era_index}_eras.pt"))
    return checkpoint


if __name__ == "__main__":
    vocab_path = "data/vocab.txt"
    in_tokens = 2
    embedding_size = 128
    with open(vocab_path) as r:
        vocab = list(map(lambda l: l.strip(), r.readlines()))
    assert len(vocab) == len(set(vocab))
    vocab_size = len(vocab) + 1

    model = LanguageModel(in_tokens, vocab_size, embedding_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    train(model,
          optimizer,
          vocab, ["data/parted/0.txt"], ["data/parted/1.txt"],
          batch_size=32,
          max_train_eras=100,
          batches_per_era=100,
          max_val_batches=10)
Example #24
def test_models_have_correct_vocab_size():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert (lm.models[0].ngram_vocab_size == 7)
    assert (lm.models[1].ngram_vocab_size == 9)
Example #25
def test_kn_produces_expected_values_n4():
    lm = LanguageModel(4)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.kn_evaluate(['shall', 'train', 'text',
                           '.']) == -0.7742507185722116
Example #26
    def test_create_new(self):
        lm = LanguageModel('playing')
        self.assertFalse(lm.is_ready())
Example #27
        'CANCEL': 'Idle'
    }
class PlayingMedia(Base):
    keywords = {
        'STOP': (lambda: rc.stop_playing(), 'Idle'),
        'PAUSE': lambda: rc.pause()
    }
    context = {
        'menu': 'Idle'
    }

# Create a LanguageModel that supports all the keywords defined in all the States
keywords = []
for state in [Base, Idle, SelectMedia, PlayingMedia]:
    keywords += state.keywords.keys()
all_state_lm = LanguageModel('all_state_lm', keywords)
all_state_lm.update_all()
Base.lm = all_state_lm
   
#########################################
# Old states - OBSOLETE
#########################################
    
#class InitialState(State):
#    lm = ManualLanguageModel('initial')   # Overrides automatic creation of language model
#    keywords = {
#        'MARY': 'Listening'
#    }
#
#class Listening(State):
#    keywords = {
Example #28
    def test_get_input_commands(self):
        lm = LanguageModel('playing')
        self.assertEqual(len(lm.get_input_commands()), 7)
        # Exercise reading again, just in case the second time triggers a caching error
        lm.get_input_commands()
Example #29
def test_p_next_sums_to_one():
    lm = LanguageModel(3)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert sum(lm.p_next(['this', 'text']).values()) == 1
Example #30
def test_laplace_produces_expected_values2():
    lm = LanguageModel(1)
    data = open_file('kn_test.txt')
    lm.train(data)
    assert lm.laplace_evaluate(['text']) == math.log(3 / 12)
    assert lm.laplace_evaluate(['dog']) == math.log(1 / 12)
Example #31
def segment_noise(dataset, summary):
    chunk_dict = {}
    grammar_set = []

    rands = np.random.rand(10000)
    rand_idx = 0
    gidx = 0

    batch_size = 128

    file_dir = 'data/' + dataset + '/'
    model_file = 'model/%s/lm.model' % dataset
    dict_file = 'model/%s/lm.dict.p' % dataset
    train_file = 'data/%s/train.json' % dataset

    tokens_list, tags_list = chunk_text(train_file)
    token_dict = pickle.load(open(dict_file, 'rb'))

    word_size = len(token_dict)
    word_dim = 256
    hidden_dim = 512

    model = Model(word_size, word_dim, hidden_dim)
    model.cuda()
    if os.path.exists(model_file):
        best_point = torch.load(model_file)
        state_dict = best_point['state_dict']
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            temp = state_dict[k]
            if k.startswith('module.'):
                k = k[7:]
                new_state_dict[k] = temp
        model.load_state_dict(new_state_dict)
    model.eval()

    shuffle_indices = np.random.permutation(np.arange(len(tokens_list)))
    tokens_list = np.array(tokens_list)[shuffle_indices]
    tags_list = np.array(tags_list)[shuffle_indices]

    noised_data = []

    rev_token_dict = {token_dict[token]: token for token in token_dict}
    for _ in range(1):
        for idx in tqdm(range(0, len(tokens_list), batch_size)):
            tokens_batch = tokens_list[idx:idx + batch_size]
            tags_batch = tags_list[idx:idx + batch_size]

            probs_batch = []
            probs_indices_batch = []
            for tokens in tqdm(tokens_batch):
                if not check_sentence(
                    [rev_token_dict[token] for token in tokens]):
                    continue

                x_batch, x_mask = utils.pad([tokens])
                x_batch = to_tensor(x_batch)
                x_mask = to_tensor(x_mask).float()

                ps_batch = model(x_batch, x_mask, ps_only=True)
                ps_batch = F.softmax(ps_batch, dim=-1)
                ps_batch = list(ps_batch.cpu().detach().numpy())

                probs_sequence = []
                probs_indices_sequence = []
                for ps in ps_batch[0]:
                    probs, probs_indices = nuclear_filter(ps)
                    probs_sequence.append(probs)
                    probs_indices_sequence.append(probs_indices)

                probs_batch.append(probs_sequence)
                probs_indices_batch.append(probs_indices_sequence)

            chunk_dict = {}
            grammar_set = []

            chunks_batch = []
            ctags_batch = []
            for tokens, tags in zip(tokens_batch, tags_batch):
                chunks, ctags = split_to_chunks(tokens[1:-1], tags, chunk_dict,
                                                grammar_set)
                chunks_batch.append(chunks)
                ctags_batch.append(ctags)

            for chunk in chunk_dict:
                chunk_dict[chunk] = list(set(chunk_dict[chunk]))
            grammar_set = list(set(grammar_set))
            np.random.shuffle(grammar_set)

            ps_idx = 0
            for j, (tokens, chunks, ctags) in enumerate(
                    tqdm(zip(tokens_batch, chunks_batch, ctags_batch),
                         total=len(chunks_batch))):
                if not check_sentence(
                    [rev_token_dict[token] for token in tokens]):
                    continue

                lm_chunk_inputs = []

                probs = probs_batch[ps_idx]
                probs_indices = probs_indices_batch[ps_idx]
                ps_idx += 1
                if dataset == 'rotten':
                    N = 20
                else:
                    N = 8
                for _ in tqdm(range(N)):
                    try:
                        new_chunks = replace_tokens(chunks, probs,
                                                    probs_indices)

                        new_chunks, new_ctags = remove_chunks(
                            new_chunks, ctags)
                        lm_chunk_input = insert_chunks(new_chunks, new_ctags,
                                                       chunk_dict, grammar_set,
                                                       rands, rand_idx, gidx)
                        lm_chunk_input = ' '.join([
                            rev_token_dict[token] for token in lm_chunk_input
                        ])
                        lm_chunk_inputs.append(lm_chunk_input)
                    except:
                        pass

                inst = {}
                inst['summary'] = ' '.join(
                    [rev_token_dict[token] for token in tokens[1:-1]])
                inst['segment_reviews'] = lm_chunk_inputs

                noised_data.append(inst)

    return noised_data
Example #32
class LanguageModelTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        print("\LanguageModelTests starts")
        print("==========")

    @classmethod
    def tearDownClass(cls):
        print("==========")
        print("LanguageModelTests has ended")

    def setUp(self):
        self.lm = LanguageModel(3)
        self.token_sequences = [['the', 'cat', 'runs'], ['the', 'dog', 'runs']]
        self.lm.train(self.token_sequences)

    def test_get_ngrams(self):
        print("id: " + self.id())
        self.lm.n = 4
        input_tokens = ['the', 'cat', 'in', 'the', 'hat']
        result_ngrams = [
            (None, None, None, 'the'), (None, None, 'the', 'cat'),
            (None, 'the', 'cat', 'in'), ('the', 'cat', 'in', 'the'),
            ('cat', 'in', 'the', 'hat'), ('in', 'the', 'hat', None),
            ('the', 'hat', None, None), ('hat', None, None, None)
        ]
        self.assertEqual(self.lm.get_ngrams(input_tokens), result_ngrams)

    def test_train_vocabulary_and_counts(self):
        print("id: " + self.id())
        self.assertEqual(self.lm.vocabulary,
                         {None, 'the', 'cat', 'runs', 'dog'})

        result_counts = {
            (None, None): {
                'the': 2
            },
            (None, 'the'): {
                'cat': 1,
                'dog': 1
            },
            ('the', 'cat'): {
                'runs': 1
            },
            ('cat', 'runs'): {
                None: 1
            },
            ('runs', None): {
                None: 2
            },
            ('the', 'dog'): {
                'runs': 1
            },
            ('dog', 'runs'): {
                None: 1
            }
        }
        self.assertEqual(self.lm.counts, result_counts)

    def test_normalize(self):
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        result_probabilities = {'cat': 0.5, 'dog': 0.5}
        self.assertEqual(self.lm.normalize(input_words), result_probabilities)

    def test_normalize_sum_probabilities(self):
        print("id: " + self.id())
        input_words = {'cat': 1, 'dog': 1}
        probabilities = self.lm.normalize(input_words)

        prob_sum = 0
        for key in probabilities:
            prob_sum += probabilities[key]
        self.assertEqual(prob_sum, 1)

    def test_predict_next(self):
        print("id: " + self.id())
        input_tokens = [None, "zero", None, 'the', 'dog']
        result_probabilities = {'runs': 1}
        self.assertEqual(self.lm.p_next(input_tokens), result_probabilities)

    def test_sample(self):
        print("id: " + self.id())
        input_probability_distribution = {'heads': 0.5, 'tails': 0.5}
        predicted_word = self.lm.sample(input_probability_distribution)[0]
        self.assertIn(predicted_word, input_probability_distribution)
Example #33
    def setUp(self):
        self.lm = LanguageModel(3)
        self.token_sequences = [['the', 'cat', 'runs'], ['the', 'dog', 'runs']]
        self.lm.train(self.token_sequences)
Example #34
    def test_create_existing(self):
        lm = LanguageModel('2503')
        self.assertTrue(lm.is_ready())
Example #35
def main(args):
	"""
	Main function of the program operates based on the argument provided.

	Train
		- Ask for ngram
		- Ask for training file path
		- Train language model
		- Save the trained model

	Generate
		- Load the saved model from pickle file
		- Ask for a beam search (y/n)
			- Ask Beam length
		- Print one generated sentence in terminal
		- Ask for number of sentences to be generated on file
		- Save the input number of sentences in a file (Default: new_shakespeare.txt)

	Perplexity
		- Load Pickle file
		- Ask the test set file path
		- Print perplexity value

	Common
		- Load pickle
		- Ask number of most common ngram
		- Print the most common n-grams with their occurrence counts.

	"""
	if args['train']:
		if not args['--n']:
			ngram = input("Please enter n for n-gram (Default: 3)-\n")
			if not ngram:
				ngram = 3
		else:
			ngram = args['--n']
		lm = LanguageModel(int(ngram))

		if not args['--path']:
			path = input("Please enter path of the file-\n")
		else:
			path = args['--path']
		lm.train(readFile(path))
		print("N-gram training completed")
		print("Saving the model")
		f = open('trained_model_ngram.pkl','wb')
		pickle.dump(lm, f)
		f.close()
		print("Model saved")

	if args['generate']:
		lm = loadPickle()

		if click.confirm('Do you want to generate with Beam search?', default=True):
			lm.beam_flag = True
			beam_size = input("Enter beam size (Default: 20)-\n")
			if beam_size:
				lm.beam_width = int(beam_size)
		else:
			lm.beam_flag = False
		print("Generating one sentence in terminal...")
		print(detokenize(lm.generate()))
		if not args['--lines']:
			noOfText = input("Enter number of generated text you want to save (Default: 10)-\n")
			if not noOfText:
				noOfText = 10
		else:
			noOfText = args['--lines']
		generated = []
		for g in range(0, int(noOfText)):
			generated.append(detokenize(lm.generate()))

		with open('new_shakespeare.txt', 'w') as f:
			for g in generated:
				f.write("%s\n" % g)
		print("Sentence file generated in current folder")

	if args['perplexity']:
		lm = loadPickle()
		if not args['--path']:
			path = input("Please enter path of the test file-\n")
		else:
			path = args['--path']
		print("Perplexity for {}-gram is {}".format(lm.ngram,lm.perplexity(readFile(path))))

	if args['common']:
		lm = loadPickle()
		if args['--number']:
			number = args['--number']
		else:
			number = 5
		lm.count_common_ngram(int(number))
Example #36
import pickle
from lm import LanguageModel

train_filename = "train_sequence.pkl"
model_filename = "model.pkl"

dataset = pickle.load(open(train_filename, "rb"))

lm = LanguageModel(lidstone_param=3e-4)
lm.fit(dataset)

pickle.dump(lm, open(model_filename, "wb"))
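For completeness, restoring the saved model is the standard pickle counterpart (model_filename as above):

lm = pickle.load(open(model_filename, "rb"))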