Пример #1
0
 def load_model(cls):
     """Load the serialized model from disk, if present.

     Builds the binary path from the 'model_path' entry of the [train]
     config section (with a '.bin' suffix appended) and, when that file
     exists, caches the loaded model on the class. Silently does nothing
     when the file is missing. (`ft` is presumably the fastText binding —
     confirm against the module imports.)
     """
     cfg = get_config()
     binary_path = '{}.bin'.format(cfg.get('train', 'model_path'))
     if os.path.exists(binary_path):
         cls.__model = ft.load_model(binary_path)
Пример #2
0
 def get_corpus_path(cls, sample=None):
     """Return a corpus file path from the [train] config section.

     When *sample* is truthy the sampled-corpus path is returned,
     otherwise the segmented-corpus path.
     """
     cfg = get_config()
     key = 'sample_corpus_path' if sample else 'seg_corpus_path'
     return cfg.get('train', key)
Пример #3
0
def train_model():
    """Train the model on the segmented corpus.

    Resolves the training corpus path via the dictionary helper and the
    model output path from the [train] config section, then delegates
    the actual training to the model wrapper.
    """
    cfg = get_config()
    corpus = get_dictionary().get_corpus_path()
    model_output = cfg.get('train', 'model_path')
    get_model().train(corpus, model_output)
Пример #4
0
def test_model():
    """Evaluate the trained model on the test corpus and print metrics.

    Reads the segmented test-corpus path from the [test] config section,
    runs the model's test() on it, and prints precision, recall and the
    number of evaluated examples.
    """
    cfg = get_config()
    seg_test_path = cfg.get('test', 'test_seg_corpus_path')
    outcome = get_model().test(seg_test_path)
    metrics = (
        ('precision:', outcome.precision),
        ('recall:', outcome.recall),
        ('examples:', outcome.nexamples),
    )
    for label, value in metrics:
        print(label, value)
Пример #5
0
 def __load_user_dict(cls):
     """Load the user word list into jieba and cache the module on cls.

     Reads a gzip-compressed word list (one word per line) from the
     'user_dict_path' entry of the [train] config section, builds jieba
     userdict lines of the form "<word> <freq> n" — the frequency scales
     with word length so longer user words win over jieba's built-in
     segmentation — and registers them via jieba.load_userdict().
     """
     config = get_config()
     user_dict_path = config.get('train', 'user_dict_path')
     # gzip.open defaults to binary mode, so each line is bytes; decode
     # before formatting, otherwise the userdict entries would contain
     # "b'...'" reprs instead of the actual words under Python 3.
     # Assumes the word list is UTF-8 encoded — TODO confirm.
     # `with` guarantees the gzip handle is closed even on error.
     with gzip.open(user_dict_path) as gr:
         words = set(
             line.decode('utf-8').strip()
             for line in gr
             if line.strip()
         )
     user_dict = ['{} {} n'.format(word, len(word) * 1000) for word in words]
     buff_file = StringIO('\n'.join(user_dict))
     jieba.load_userdict(buff_file)
     cls._jieba = jieba
Пример #6
0
 def cut(cls, **kwargs):
     """Segment the corpus, or a single sentence when one is given.

     Any keyword argument the caller omits is filled in from the
     [train] config section. The user dictionary is loaded lazily on
     first use. With a non-empty 'sentence' the segmented sentence is
     returned; otherwise the whole corpus is segmented (returns None).
     """
     config = get_config()
     defaults = (
         ('corpus_path', config.get('train', 'corpus_path')),
         ('seg_corpus_path', config.get('train', 'seg_corpus_path')),
         ('sample_corpus_path', config.get('train', 'sample_corpus_path')),
         ('vocabs_path', config.get('train', 'vocabs_path')),
         ('sample', config.get('train', 'sample')),
         ('sentence', ''),
     )
     for key, value in defaults:
         kwargs.setdefault(key, value)
     if not cls._jieba:
         cls.__load_user_dict()
     if kwargs.get('sentence'):
         return cls.__cut_sentence(**kwargs)
     cls.__cut_corpus(**kwargs)
Пример #7
0
    def train(cls, input_file, output, **kwargs):
        """Train a supervised model, cache it on the class, and return it.

        Hyper-parameters not supplied by the caller fall back to the
        [model] section of the config file.

        * input_file             training file path (required)
        * output                 output file path (required)
        * label_prefix           label prefix ['__label__']
        * lr                     learning rate [0.1]
        * lr_update_rate         change the rate of updates for the learning rate [100]
        * dim                    size of word vectors [100]
        * ws                     size of the context window [5]
        * epoch                  number of epochs [5]
        * min_count              minimal number of word occurences [1]
        * neg                    number of negatives sampled [5]
        * word_ngrams            max length of word ngram [1]
        * loss                   loss function {ns, hs, softmax} [softmax]
        * bucket                 number of buckets [0]
        * minn                   min length of char ngram [0]
        * maxn                   max length of char ngram [0]
        * thread                 number of threads [12]
        * t                      sampling threshold [0.0001]
        * silent                 disable the log output from the C++ extension [1]
        * encoding               specify input_file encoding [utf-8]
        * pretrained_vectors     pretrained word vectors (.vec file) for supervised learning []
        """
        config = get_config()
        config_backed_options = (
            'lr', 'lr_update_rate', 'dim', 'ws', 'epoch',
            'word_ngrams', 'loss', 'bucket', 'thread', 'silent',
        )
        for option in config_backed_options:
            kwargs.setdefault(option, config.get('model', option))
        cls.__model = ft.supervised(input_file, output, **kwargs)
        return cls.__model