def encode(self, seq_list):
    """Encode a batch of token sequences into sentence embeddings.

    Args:
        seq_list: list of token sequences (each a list of words) fed to
            Helper.get_batch for padding/embedding lookup.

    Returns:
        The sentence-embedding array produced by the TF session run, or
        None when batching/inference raises (the error is only logged at
        debug level — callers must handle a None result).
    """
    try:
        _s_embedded, _s_lengths = Helper.get_batch(seq_list)
        feed_dict = {
            self.s_embedded: _s_embedded,
            self.s_lengths: _s_lengths,
        }
        s_embeddings = self.sess.run(self.s_embeddings, feed_dict=feed_dict)
        return s_embeddings
    except Exception as e:
        # Best-effort contract: log and return None instead of raising.
        logger.get().debug('seq_length=%s, errmsg=%s', len(seq_list), e)
        return None  # explicit — the original fell off the end implicitly


def test():
    """Smoke test: encode ./data/test.txt and print embedding stats."""
    Helper.init()
    with codecs.open('./data/test.txt', 'r', 'utf-8') as in_f:
        corpus = [line.strip('\n') for line in in_f]
    # Wrap each tokenized sentence with <s> ... </s>, keeping only words
    # present in the word2vec vocabulary.
    corpus = [['<s>']
              + [word for word in NLPUtil.tokenize_via_jieba(sent)
                 if word in Helper._word2vec]
              + ['</s>'] for sent in corpus]
    s_encoder = Encoder()
    s_embeddings = s_encoder.encode(corpus)
    # Single-argument print() is valid under both Python 2 and Python 3.
    print(s_embeddings.shape)
    print(s_embeddings.dtype)
    print(s_embeddings[0])


if __name__ == '__main__':
    logger.start('./log/encode.log', name=__name__, level='DEBUG')
    test()
# NOTE(review): this chunk begins mid-function — `output`, `input` and
# `dic` are bound above this view (tail of the commented-out data_process
# helper referenced in __main__). Both files are now managed by `with`,
# so the output handle is closed even if a row is malformed (the original
# opened `fw` outside any `with` and leaked it on error).
with codecs.open(output, 'w', 'utf-8') as fw:
    with codecs.open(input, 'r', 'utf-8') as fr:
        for line in fr:
            lines = line.strip().split('\t')
            lines[2] = str(dic[lines[2]])  # map column-3 label to its id
            fw.write('\t'.join(lines) + '\n')


def dump_word_embeddings(word2id):
    """Build and save an embedding matrix aligned with ``word2id``.

    Rows for words found in the trained word2vec model get their learned
    vector; all other rows keep a random N(0, 1) initialization. The
    matrix is written with np.save to ../data2/word2vec_new.model.

    Args:
        word2id: dict mapping word -> row index in the embedding matrix.
    """
    emb_size = 300
    vocab_size = len(word2id)
    word2vec = Word2Vec.load('../data2/word2vec.model')
    # Random init for every row; in-vocabulary rows are overwritten below.
    embeddings = np.random.randn(vocab_size, emb_size)
    for word, idx in word2id.items():
        if word in word2vec:
            embeddings[idx, :] = word2vec[word]
        else:
            embeddings[idx, :] = np.random.randn(emb_size)
    print(embeddings.shape)
    np.save('../data2/word2vec_new.model', embeddings)


if __name__ == '__main__':
    g_log_inst.start('../log/reader.log', __name__, 'DEBUG')
    save_vocab()
    #data_process('../data/train.txt', '../data/trainset.txt')
    #data_process('../data/test.txt', '../data/testset.txt')
            # Tail of a classmethod token normalizer — the enclosing `def`
            # and the `try:` matching the `except` below sit above this
            # chunk and are not visible here.
            # Pass 1: if any replace-pattern matches, substitute the
            # pattern's key for the whole token and stop looking.
            for k, v in cls._replace_pattern_cfg.items():
                if v.match(token):
                    token = k
                    break
            # Fast path: no '{[' wordseg marker means nothing to strip.
            if '{[' not in token:
                return token
            # Pass 2: strip every configured wordseg pattern from the token.
            for item in cls._wordseg_pattern_cfg:
                token = item.sub('', token)
            return token
        except Exception as e:
            # Best-effort: log and return the token unchanged on failure.
            logger.get().warn('token=%s, errmsg=%s' % (token, e))
            return token


if '__main__' == __name__:
    logger.start('./log/test.log', __name__, 'DEBUG')
    in_fpath = './data/question.raw'
    out_fpath = './data/question.raw.gbk'
    #NLPUtil.conv_fenc_u8_to_gbk(in_fpath, out_fpath)
    in_fpath = './data/question.seg.u8'
    out_fpath = './data/vocab.txt'
    #NLPUtil.stat_token_freq(in_fpath, out_fpath)
    # Sample messages (Chinese medical-QA text) for manual inspection.
    msgs = [
        u'携带乙肝病毒可以母乳喂养吗',
        u'做糖筛是不是又要打B超哦',
        u'这个crp偏高是怎么回事, 12mg, 12ml, 12mml, 11kg, 11kcal, 11k, 11kj',
        u'b 你好 乳头内陷要怎么母乳',
    ]
                # Tail of the training routine — this chunk starts
                # mid-function, inside an open TF session scope (the
                # enclosing def / loop headers are above this view).
                # Persist the trained model checkpoint.
                save_path = saver.save(session, '%s/model.ckpt' % (ckpt_dir))
                g_log_inst.get().info('[model] save success, ckpt_path=%s' % (save_path))
            # test the accuracy
            test_perplexity, accuracy, domain_accuracy = run_epoch(
                session, mtest, test_data, tf.no_op(), debug=True,
                verbose=True, id2word_dict=id2word_dict,
                dsl_converter=config.converter)
            g_log_inst.get().info('Test: perplexity=%.3f, accuracy=%s' % (test_perplexity, accuracy))
            # acc compute
            # NOTE(review): per-domain precision/recall reporting below is
            # disabled via a no-op string literal.
            '''
            for idx, domain_accu in enumerate(domain_accuracy):
                g_log_inst.get().info('Domain: %s, precision: %.3f, recall: %.3f' % (
                    config.converter.label2domain[idx],
                    domain_accuracy[idx][0] / float(domain_accuracy[idx][1]),
                    domain_accuracy[idx][2] / float(domain_accuracy[idx][3])))
            '''
            g_log_inst.get().info('bilstm_attention training finished')


if __name__ == '__main__':
    g_log_inst.start('../log/train.log', __name__, 'DEBUG')
    tf.app.run()