示例#1
0
 def __init__(self, max_term_count=None, dict_path=None):
     if max_term_count:
         self.m_max_term_count = max_term_count
     else:
         self.m_max_term_count = default_max_term_count
     if not WordSeg.m_dict_loaded:
         if dict_path:
             WordSeg.m_dict_path = dict_path
         else:
             WordSeg.m_dict_path = default_dict_path
         print WordSeg.m_dict_path
         WordSeg.m_dict_handle = wordseg.scw_load_worddict(
             WordSeg.m_dict_path)
         if WordSeg.m_dict_handle:
             WordSeg.m_dict_loaded = True
     self.m_result_handle = wordseg.scw_create_out(self.m_max_term_count *
                                                   10)
     self.m_token_handle = wordseg.create_tokens(self.m_max_term_count)
     self.m_token_handle = wordseg.init_tokens(self.m_token_handle,
                                               self.m_max_term_count)
     self.m_mode['WPCOMP'] = wordseg.SCW_WPCOMP
     self.m_mode['BASIC'] = wordseg.SCW_BASIC
     self.m_mode['SUBPH'] = wordseg.SCW_SUBPH
     self.m_mode['NEWWORD'] = wordseg.SCW_NEWWORD
     self.m_mode['HUMAN'] = wordseg.SCW_HUMANNAME
     self.m_mode['BOOK'] = wordseg.SCW_BOOKNAME
     self.m_mode['DISAMB'] = wordseg.SCW_DISAMB
示例#2
0
    def __init__(self, dict_path):
        """Load the dictionaries under ``dict_path`` and allocate buffers.

        Args:
            dict_path: Directory containing the ``wordseg/chinese_gbk`` and
                ``postag`` resources.
        """
        print >> sys.stderr, 'WordSegUtil constructed'
        self.MAX_TERM_CNT = 2048

        seg_dict_location = os.path.join(dict_path, 'wordseg/chinese_gbk')
        tag_dict_location = os.path.join(dict_path, 'postag')

        # Dictionaries first, then the buffers that hold results.
        self.scw_worddict = wordseg.scw_load_worddict(seg_dict_location)
        self.scw_tagdict = postag.tag_create(tag_dict_location)

        term_capacity = self.MAX_TERM_CNT
        # The output buffer is sized at ten times the term capacity.
        self.scw_out = wordseg.scw_create_out(term_capacity * 10)

        # Token array: created, then replaced by what init_tokens returns.
        self.tokens = wordseg.create_tokens(term_capacity)
        self.tokens = wordseg.init_tokens(self.tokens, term_capacity)
示例#3
0
 def __init__(self, segdict_conf_path, segdict_path):
     '''
     @brief Constructor: load the segmenter configuration and dictionary,
            then allocate the result and token buffers.
     @param segdict_conf_path location of the configuration file
     @param segdict_path location of the word-segmentation dictionary

     Any exception raised during loading is logged as a warning and not
     propagated, so construction always completes. Handles that would have
     been assigned after the failing call are left unset in that case.
     '''
     self.max_term_count = 512
     try:
         self.conf_handle = wordseg.scw_load_conf(segdict_conf_path)
         self.dict_handle = wordseg.scw_load_worddict(segdict_path)
         # The result buffer is sized at ten times the token capacity.
         self.result_handle = wordseg.scw_create_out(
             self.max_term_count * 10)
         self.token_handle = wordseg.create_tokens(self.max_term_count)
         self.token_handle = wordseg.init_tokens(self.token_handle,
                                                 self.max_term_count)
     except Exception as e:
         # Load failures are reported but deliberately not re-raised here.
         log.warning("SegDict Load Error! error=%s", e)
示例#4
0
    def __init__(self):
        """Load the dictionaries stored beside this module and allocate buffers.

        Resources are read from the ``dict`` directory next to this source
        file: ``worddict``, ``tagdict`` and ``rankdict``.
        """
        print >> sys.stderr, "WordSegUtil constructed"
        self.maxTermCount = 2048

        module_dir = os.path.dirname(os.path.abspath(__file__))
        dict_root = module_dir + "/dict"

        # Load the dictionaries. The NER dictionary (wordner) is not loaded.
        word_dict_path = os.path.join(dict_root, "worddict")
        self.hWordDict = wordseg.scw_load_worddict(word_dict_path)
        tag_dict_path = os.path.join(dict_root, "tagdict")
        self.hTagDict = postag.tag_create(tag_dict_path)
        rank_dict_path = os.path.join(dict_root, "rankdict")
        self.hRankDict = wordrank.wdr_create(rank_dict_path)

        # Output buffers; no NER output buffer is created.
        self.hScwOut = wordseg.scw_create_out(self.maxTermCount * 10)
        self.hRanks = wordrank.create_ranks(self.maxTermCount)

        # Token array: created, then replaced by what init_tokens returns.
        self.hTokens = wordseg.create_tokens(self.maxTermCount)
        self.hTokens = wordseg.init_tokens(self.hTokens, self.maxTermCount)

        # Whitelist of named-entity tags used for proper-noun filtering.
        # LOC (place), ORG (organization), SFT (software) and NVL (novel)
        # are currently disabled.
        self.nerWhiteTags = set([
            "PER",         # person name
            "GME",         # game
            "SNG",         # song
            "VDO",         # video
            "BRD",         # brand
            "CTN",         # animation
            "VDO_MVE",     # movie
            "VDO_TV",      # TV series
            "VDO_TVSHOW",  # TV show
        ])
示例#5
0
 def __init__(self, max_term_count = None, dict_path = None):
     if max_term_count:
         self.m_max_term_count = max_term_count
     else:
         self.m_max_term_count = default_max_term_count
     if not WordSeg.m_dict_loaded:
         if dict_path:
             WordSeg.m_dict_path = dict_path
         else:
             WordSeg.m_dict_path = default_dict_path
         print WordSeg.m_dict_path
         WordSeg.m_dict_handle = wordseg.scw_load_worddict(WordSeg.m_dict_path)
         if WordSeg.m_dict_handle:
             WordSeg.m_dict_loaded = True
     self.m_result_handle = wordseg.scw_create_out(self.m_max_term_count*10)
     self.m_token_handle = wordseg.create_tokens(self.m_max_term_count)
     self.m_token_handle = wordseg.init_tokens(self.m_token_handle, self.m_max_term_count)
     self.m_mode['WPCOMP'] = wordseg.SCW_WPCOMP
     self.m_mode['BASIC'] = wordseg.SCW_BASIC
     self.m_mode['SUBPH'] = wordseg.SCW_SUBPH
     self.m_mode['NEWWORD'] = wordseg.SCW_NEWWORD
     self.m_mode['HUMAN'] = wordseg.SCW_HUMANNAME
     self.m_mode['BOOK'] = wordseg.SCW_BOOKNAME
     self.m_mode['DISAMB'] = wordseg.SCW_DISAMB
示例#6
0
    line = line.strip("\n").split("\t")
    query = line[2]
    title = line[5]
    if query == "" or title == "":
        continue
    query = urllib.unquote(query)
    title = urllib.unquote(title)
    label = int(line[7])
    pos.append([query, title, label])

# Characters trimmed from both ends of every query and title in the loop
# that follows.
strip_chars = [".", ",", "-", "_", ":"]
# NOTE(review): not used within this excerpt -- presumably collects the
# processed rows; confirm against the rest of the script.
final = []

# Capacity handed to both the result buffer and the token array below.
MAX_TERM_COUNT = 1024

# Segmenter resources are created once here and reused for every
# scw_segment_words / scw_get_token_1 call in the loop below.
# NOTE(review): the dictionary path is relative, so it depends on the
# working directory the script is launched from.
dict_handle = wordseg.scw_load_worddict("./dict/wordseg_dict/")
# NOTE(review): the result buffer is sized at MAX_TERM_COUNT with no
# multiplier -- confirm this is large enough for the expected input.
result_handle = wordseg.scw_create_out(MAX_TERM_COUNT)
# The handle from create_tokens is passed through init_tokens and the
# returned value is the one kept.
token_handle = wordseg.create_tokens(MAX_TERM_COUNT)
token_handle = wordseg.init_tokens(token_handle, MAX_TERM_COUNT)

for query, title, label in pos:

    for char in strip_chars:
        query = query.strip(char)
        title = title.strip(char)

    query_title = []
    for line in [query, title]:
        wordseg.scw_segment_words(dict_handle, result_handle, line, 1)
        token_count = wordseg.scw_get_token_1(result_handle,
                                              wordseg.SCW_WPCOMP, token_handle,
示例#7
0
    line = line.strip("\n").split("\t")
    query = line[2]
    title = line[5]
    if query == "" or title == "":
        continue
    query = urllib.unquote(query)
    title = urllib.unquote(title)
    label = int(line[7])
    pos.append([query, title, label])

# Characters trimmed from both ends of every query and title in the loop
# that follows.
strip_chars = [".", ",", "-", "_", ":"]
# NOTE(review): not used within this excerpt -- presumably collects the
# processed rows; confirm against the rest of the script.
final = []

# Capacity handed to both the result buffer and the token array below.
MAX_TERM_COUNT = 1024

# Segmenter resources are created once here and reused for every
# scw_segment_words / scw_get_token_1 call in the loop below.
# NOTE(review): the dictionary path is relative, so it depends on the
# working directory the script is launched from.
dict_handle = wordseg.scw_load_worddict("./dict")
# NOTE(review): the result buffer is sized at MAX_TERM_COUNT with no
# multiplier -- confirm this is large enough for the expected input.
result_handle = wordseg.scw_create_out(MAX_TERM_COUNT)
# The handle from create_tokens is passed through init_tokens and the
# returned value is the one kept.
token_handle = wordseg.create_tokens(MAX_TERM_COUNT)
token_handle = wordseg.init_tokens(token_handle, MAX_TERM_COUNT)

for query, title, label in pos:

    for char in strip_chars:
        query = query.strip(char)
        title = title.strip(char)

    query_title = []
    for line in [query, title]:
        wordseg.scw_segment_words(dict_handle, result_handle, line, 1)
        token_count = wordseg.scw_get_token_1(result_handle,
                                              wordseg.SCW_WPCOMP, token_handle,