Exemplo n.º 1
0
def extract_clinic_term(str_origin, separator='||'):
    """
    Insert a separator at specific positions of a string.

    Positions handled:
      a. consecutive numbers that are separated by other characters --> ||
         (the actual rewrite is delegated to ``regex_replace``)
      b. ``;`` --> separator
      c. ``?`` --> separator

    :param str_origin: the string to process
    :param separator: the separator to insert (defaults to ``||``)
    :return: the processed string
    """
    logger = initiate_log(logger_name=Constants.log_name_default)
    logger.debug('原始的字符串:%s', str_origin)

    # Collect every "<digits><non-digits>" pair; only the numeric part is
    # needed to detect the incrementing runs.
    numbered_chunks = re.findall(r'(\d+)(\D+)', str_origin)
    nums = [int(chunk[0]) for chunk in numbered_chunks]
    nums_new = is_element_increment(nums)
    logger.debug('连续递增的数字: %s', nums_new)

    if nums_new:
        for run in nums_new:
            # One "(<number>)(\D+)" group pair per number of the run.  The
            # fragment is a raw string so that "\D" reaches the regex engine
            # instead of being treated as an (invalid) string escape.
            regex = ''.join(r'({num})(\D+)'.format(num=num) for num in run)
            str_origin = re.sub(regex, regex_replace, str_origin)

    # Any run of ';' and/or '?' becomes a single separator.
    str_origin = re.sub(r'(;|\?)+', separator, str_origin)

    # Remove repeated "||": the pattern matches a run of three or more '|'
    # characters and replaces the whole run with one separator.
    return re.sub(r'\|\|{2,}', separator, str_origin)
Exemplo n.º 2
0
 def __init__(self):
     """
     Initialise the MySQL connection parameters and open the connection.

     Stores the connection settings read from ``db_config`` in
     ``self.config``, the connection in ``self.cnx`` and a dictionary
     cursor in ``self.cursor``.

     :raises DpException: with ``ErrorConstants.ec_sys_error`` when
         ``connect`` raises ``Error``.
     """
     logger = initiate_log(__name__)
     self.config = {
         'host': db_config.host,
         'database': db_config.db,
         'user': db_config.user,
         'password': db_config.passwd,
         'raise_on_warnings': False,
         'charset': 'utf8',
     }
     try:
         self.cnx = connect(**self.config)
     except Error as err:
         if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
             err_msg = "Something is wrong with your user name or password"
         elif err.errno == errorcode.ER_BAD_DB_ERROR:
             err_msg = "Database does not exist"
         else:
             err_msg = err.msg
         logger.error(err_msg)
         raise DpException(ErrorConstants.ec_sys_error,
                           ErrorConstants.error_code_message
                           .get(ErrorConstants.ec_sys_error) + err_msg) from err
     else:
         # Only reached when connect() succeeded.  Doing this in ``finally``
         # would touch the missing ``self.cnx`` after a failed connect and
         # replace the DpException with an AttributeError.
         self.cursor = self.cnx.cursor(dictionary=True)
         logger.info("database connect success.")
Exemplo n.º 3
0
def main():
    """
    Split the terms stored in ``wip_cterm`` and write the results back.

    Steps:
      1. select the rows of ``wip_cterm``;
      2. run ``extract_clinic_term`` on every row's ``str_new`` value and
         keep the result under ``str_ret``;
      3. upsert ``(origin_cd, str_ret)`` into ``wip_cterm``;
      4. insert one ``(origin_cd, term)`` row per separated term into
         ``wip_cterm_element`` (``insert ignore``).

    Nothing is written when the select returns no rows.
    """
    logger = initiate_log(logger_name=Constants.log_name_brief)
    separator = '||'
    table_name_wip_cterm = 'wip_cterm'
    table_name_wip_cterm_element = 'wip_cterm_element'
    connector = MySQLConnector()
    # NOTE(review): "where origin_cd" filters on the column's truthiness;
    # confirm this is the intended condition.
    sql = """
        select * from {table_name} where origin_cd;
    """.format(table_name=table_name_wip_cterm)
    data = connector.query_data(sql=sql)

    if not data:
        # An empty value list would produce an invalid "insert ... values"
        # statement, so stop before building any SQL.
        logger.info('no rows selected from wip_cterm, nothing to insert')
        return

    for row in data:
        if 'str_new' in row:
            row['str_ret'] = extract_clinic_term(row['str_new'])

    # Build the value lists of both insert statements.
    # NOTE(review): values are interpolated into the SQL text and only '"'
    # is escaped; a parameterized statement would be safer, but the
    # signature of manipulate_data is not visible here.
    cterm_values = []
    element_values = []
    for row in data:
        str_ret = row['str_ret'].replace('"', '\\"')
        cterm_values.append('("{origin_cd}", "{str_ret}")'.format(
            origin_cd=row['origin_cd'], str_ret=str_ret))
        element_values.extend(
            '("{origin_cd}", "{term}")'.format(
                origin_cd=row['origin_cd'], term=term)
            for term in str_ret.split(separator))
    sql_value_wip_cterm = ', '.join(cterm_values)
    sql_value_wip_cterm_element = ', '.join(element_values)

    # Insert data into the wip_cterm table.
    logger.info('begin insert data into wip_cterm')
    sql = """
        insert into {table_name}(`origin_cd`, `str_ret`) values{sql_value}
        on DUPLICATE key update str_ret = values(`str_ret`);
    """.format(table_name=table_name_wip_cterm, sql_value=sql_value_wip_cterm)
    connector = MySQLConnector()
    connector.manipulate_data(sql)
    logger.info('end insert data into wip_cterm')

    # Insert data into the wip_cterm_element table.
    logger.info('begin insert data into wip_cterm_element')
    sql = """
        insert ignore into {table_name}(`origin_cd`, `term`) values{sql_value}
    """.format(table_name=table_name_wip_cterm_element,
               sql_value=sql_value_wip_cterm_element)
    connector = MySQLConnector()
    connector.manipulate_data(sql)
    logger.info('end insert data into wip_cterm_element')
Exemplo n.º 4
0
class TestNltk(unittest.TestCase):
    """
    nltk (natural language toolkit) smoke test: BLEU score on jieba tokens.
    """
    logger = initiate_log()

    def setUp(self):
        """No fixture is needed."""
        # nltk.download("all")

    def test_simple(self):
        """Tokenize a few sentences and log the BLEU score of the first."""
        hypothesis = jieba.lcut("""我最爱吃的东西是凤梨""")
        self.logger.debug(','.join(hypothesis))

        # The custom word is registered after the hypothesis was tokenized,
        # so it only affects the reference sentences below.
        jieba.add_word('爱吃')

        first_reference = jieba.lcut('我爱吃的东西是凤梨啊')
        self.logger.debug('references 1: %s', first_reference)

        second_reference = jieba.lcut('他不爱吃苹果')
        self.logger.debug('references 2: %s', second_reference)

        third_reference = jieba.lcut('我们都是中国人地地道道')
        self.logger.debug('references 3: %s', third_reference)

        # Only the first reference takes part in the score.
        bleu_score = nltk.bleu([first_reference], hypothesis)
        self.logger.debug('bleu score is %s', bleu_score)
Exemplo n.º 5
0
class Jieba(unittest.TestCase):
    """
    Exploratory tests for jieba, a Chinese word-segmentation library.

    The tests log the segmentation results; they make no assertions.
    """
    # Kept as a class attribute so that unittest.TestCase.__init__ does not
    # have to be overridden.
    logger = initiate_log()

    @staticmethod
    def _dict_path(filename):
        """Return the path of *filename* inside the ``jieba_dict`` directory
        located next to this module."""
        # Joining on the absolute module directory avoids ending up with
        # "/jieba_dict/..." (filesystem root) when dirname(__file__) is ''.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(module_dir, 'jieba_dict', filename)

    def get_split_line(self, topic):
        """Return *topic* framed by 30 ``=`` characters on each side."""
        return '=' * 30 + topic + '=' * 30

    def test_split_mode(self):
        """
        Segmentation modes.
        """
        topic = '分词模式'
        split_line = self.get_split_line(topic)
        self.logger.debug(split_line)

        # Full mode: scans out every word that can be formed in the
        # sentence; very fast, but cannot resolve ambiguity.
        seg_list = jieba.cut("中华人民共和国", cut_all=True)
        self.logger.info("{topic}_全模式: {msg}".format(
            topic=topic, msg="/ ".join(seg_list)))

        # Precise mode: tries to cut the sentence as precisely as possible;
        # suited to text analysis.
        seg_list = jieba.cut("中华人民共和国", cut_all=False)
        self.logger.info("{topic}_精确模式(默认): {msg}".format(
            topic=topic, msg="/".join(seg_list)))

        # Search mode: on top of precise mode, long words are cut again to
        # improve recall; suited to search-engine tokenization.
        seg_list = jieba.cut_for_search("中华人民共和国")
        self.logger.info("{topic}_搜索模式: {msg}".format(
            topic=topic, msg=", ".join(seg_list)))

    def test_user_dict(self):
        """
        2. Custom dictionary.
        """
        topic = '添加自定义词典'
        split_line = self.get_split_line(topic)
        self.logger.debug(split_line)
        test_sent = """李小福是创新办主任也是云计算方面的专家; 什么是八一双鹿\n
        例如我输入一个带“韩玉赏鉴”的标题,在自定义词库中也增加了此词为N类\n
        台中」正確應該不會被切開。mac上可分出「石墨烯」;此時又可以分出來凱特琳了。
        """
        words = jieba.cut(test_sent)
        self.logger.debug('{topic}_原始: {msg}'.format(
            topic=topic, msg='/'.join(words)))

        # Adjust the dictionary: modify it dynamically.
        userdict_path = self._dict_path('dict.txt')
        jieba.add_word('石墨烯')
        jieba.add_word('凱特琳')
        jieba.del_word('自定义词')
        jieba.load_userdict(userdict_path)
        words = jieba.cut(test_sent)
        self.logger.debug('{topic}_自定义字典分词:{msg}'.format(
            topic=topic, msg='/'.join(words)))

        self.logger.debug('test split words' + "=" * 40)
        terms = jieba.cut('easy_install is great')
        self.logger.debug('{topic}_字典分词: {msg}'.format(
            topic=topic, msg='/'.join(terms)))
        jieba.del_word('easy_install')
        terms = jieba.cut('easy_install is great')
        self.logger.debug('{topic}_删除单词: {msg}'.format(
            topic=topic, msg='/'.join(terms)))
        terms = jieba.cut('python 的正则表达式是好用的')
        self.logger.debug('{topic}_单词: {msg}'.format(
            topic=topic, msg='/'.join(terms)))

        self.logger.debug('test frequency tune' + "=" * 40)
        word = '这里中将应该被切开'
        self.logger.debug('{topic}_调低词频之前: {msg}'.format(
            topic=topic, msg='/'.join(jieba.cut(word))))
        # "before" is read first, then suggest_freq tunes the frequency.
        self.logger.debug('{topic}_调整词频: {msg}'.format(
            topic=topic,
            msg='before: {before}, after: {after}'.format(
                before=jieba.get_FREQ('中将'),
                after=jieba.suggest_freq(('中', '将'), True))))
        self.logger.debug('{topic}_调低词频之后: {msg}'.format(
            topic=topic, msg='/'.join(jieba.cut(word, HMM=False))))

        jieba.del_word('台中')
        word = '[台中]正确应该不会被切开'
        self.logger.debug('{topic}_调高词频之前: {msg}'.format(
            topic=topic, msg='/'.join(jieba.cut(word))))
        self.logger.debug('{topic}_调整词频: {msg}'.format(
            topic=topic,
            msg='before: {before}, after: {after}'.format(
                before=jieba.get_FREQ('台中'),
                after=jieba.suggest_freq('台中', True))))
        self.logger.debug('{topic}_调高词频之后: {msg}'.format(
            topic=topic, msg='/'.join(jieba.cut(word, HMM=False))))

    def test_extract_tags(self):
        """
        3. Keyword extraction.
        """
        topic = '关键词抽取'
        split_line = self.get_split_line(topic=topic)
        self.logger.info(split_line)

        term = '我们时人中国的可是is of super man'
        res = analyse.extract_tags(term)
        self.logger.info('{topic}_标准抽取: {term} -> {msg}'.format(
            topic=topic, term=term, msg=res))

        # TODO: running with a custom corpus has problems
        user_idf_path = self._dict_path('idf.txt.big')
        analyse.set_idf_path(user_idf_path)
        res = analyse.extract_tags(term)
        self.logger.info('{topic}_自定义逆向文件频率: {term} -> {msg}'.format(
            topic=topic, term=term, msg=res))

    def test_tokenize(self):
        """
        Tokenize: return the position of each word in the original text.
        """
        topic = '返回词语在原文的位置'
        split_line = self.get_split_line(topic)
        self.logger.debug(split_line)

        term = '我们都是炎黄子孙'

        # Default mode.
        res = jieba.tokenize(term)
        for tk in res:
            self.logger.debug('{topic}_默认模式_{term}: {msg}'.format(
                topic=topic,
                term=term,
                msg="word %s\t\t start: %d \t\t end:%d" %
                (tk[0], tk[1], tk[2])))

        # Search mode.
        res = jieba.tokenize(term, mode='search')
        for tk in res:
            self.logger.debug('{topic}_搜索模式_{term}: {msg}'.format(
                topic=topic,
                term=term,
                msg="word %s\t\t start: %d \t\t end:%d" %
                (tk[0], tk[1], tk[2])))
Exemplo n.º 6
0
# -*- coding: utf-8 -*-

from kazoo.client import KazooClient
from dataProcess.dap.modules.log_initiate import initiate_log
from kazoo.client import KazooState
from kazoo.protocol.states import WatchedEvent
from kazoo.handlers.threading import KazooTimeoutError
import unittest
import json

# Module-level logger used by the functions below.
logger = initiate_log()

# "host:port" address; presumably the ZooKeeper server handed to
# KazooClient - TODO confirm against the usage further down the file.
hosts = "192.168.150.250:2181"
# Base node path; the MySQL node path below is derived from it.
node = "dcm/beta"
# Path of the "mysql" child under `node`.
nodeMysql = node + "/mysql"


def connection_listener(state):
    """
    Log an error when *state* equals ``KazooState.SUSPENDED``.

    Any other state is ignored.

    :param state: the connection state to inspect
    :return: None
    """
    if state != KazooState.SUSPENDED:
        return
    # handle being disconnected from zookeeper
    logger.error("disconnect from zookeeper")


def watcher_default(event: WatchedEvent):
    """
    default watcher: A watch function passed to get() or exists()
    will be called when the data on the node changes
    or the node itself is deleted.
    and it is one-time watch events
    :param event:
    :return:
Exemplo n.º 7
0
# -*- coding: utf-8 -*-
from kazoo.client import KazooClient
from kazoo.client import KazooState
from kazoo.protocol.states import WatchedEvent
from kazoo.handlers.threading import KazooTimeoutError
from dataProcess.dap.confs.zk import zk_hosts
from dataProcess.dap.modules.log_initiate import initiate_log
from dataProcess.dap.modules.dp_exception import DpException
from dataProcess.dap.modules.constants import ErrorConstants, Constants
import json
import dataProcess.dap.confs.db_config as db_config

# Module-level logger, initialised with this module's name.
logger = initiate_log(__name__)


def connection_listener(state):
    """
    Log and raise when *state* equals ``KazooState.SUSPENDED``.

    Any other state is ignored.

    :param state: the connection state to inspect
    :raises DpException: with ``ErrorConstants.ec_sys_error`` when the
        state is SUSPENDED
    """
    if state != KazooState.SUSPENDED:
        return

    # handle being disconnected from zookeeper
    err_msg = "disconnect from zookeeper"
    logger.error(err_msg)

    error_code = ErrorConstants.ec_sys_error
    # Message registered for the system error code, followed by the detail.
    full_message = (
        ErrorConstants.error_code_message.get(ErrorConstants.ec_sys_error)
        + err_msg)
    raise DpException(error_code, full_message)


class ZkConfig:
    def __init__(self):
        self.zk = KazooClient(hosts=zk_hosts)
        try:
            self.zk.start()