Example #1
def load_doc_list():
    pynlpir.open()
    doc_list = os.listdir(SOURCE_DOC_DIR_PATH)
    segment_list = []
    for doc in doc_list:
        fr = codecs.open(SOURCE_DOC_DIR_PATH + doc, 'r', 'utf-8')
        line_list = fr.read()
        fr.close()
        '''
        line_list = line_list.split(NEW_LINE)
        line_list.pop()
        # seg_str = ''
        for i in range(len(line_list)):
            segment = pynlpir.segment(line_list[i], pos_tagging=False)
            seg_str = ''
            for seg in segment:
                seg_str += seg + ' '
            line_list[i] = seg_str.strip()
        # segment_list.append(' '.join(line_list))
        temp_str = ' '.join(line_list)
        '''
        key_word_list = pynlpir.get_key_words(line_list, max_words=10, weighted=True)
        for key_word in key_word_list:
            print(key_word[0], '\t', key_word[1])
        pynlpir.close()
        exit(0)
Example #2
def get_key_words():
    s = ''
    max_words = MAX_WORDS_DEFAULT
    max_hot_words = MAX_HOT_WORDS_DEFAULT
    update_hot_word = UPDATE_HOT_WORD_DEFAULT
    # get doc
    if request.method == 'POST':
        s = request.form.get('s', type=str, default='')
        update_hot_word = request.form.get('update_hot_word', type=str, default=UPDATE_HOT_WORD_DEFAULT) # whether to update the hot_word table
        try:
            max_words = request.form.get('max_words', type=str, default=MAX_WORDS_DEFAULT)
            if max_words != '': # a max_words parameter was supplied (may be the default '3')
                print('[POST] max_words yes')
                max_words = int(max_words.strip())
                print('\tmax_words =', max_words)
            else:
                max_words = MAX_WORDS_DEFAULT
                print('[POST] max_words no')
        except: # failed to parse max_words, fall back to the default of 3
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = request.form.get('max_hot_words', type=str, default=MAX_HOT_WORDS_DEFAULT)
            if max_hot_words != '':
                max_hot_words = int(max_hot_words.strip())
            else:
                max_hot_words = MAX_HOT_WORDS_DEFAULT
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    elif request.method == 'GET':
        s = request.args.get('s')
        update_hot_word = request.args.get('update_hot_word')
        if update_hot_word != 'False':
            update_hot_word = 'True'
        try:
            max_words = int(request.args.get('max_words').strip())
        except:
            max_words = MAX_WORDS_DEFAULT
        try:
            max_hot_words = int(request.args.get('max_hot_words').strip())
        except:
            max_hot_words = MAX_HOT_WORDS_DEFAULT
    # get key words
    if s == '': # the document is empty, nothing to analyse
        return 'null'
    else: # extract key words
        pynlpir.open()
        key_word_list = pynlpir.get_key_words(s, max_words=max_words, weighted=False)
        # temp_str = ''
        for i in range(len(key_word_list)):
            key_word_list[i] = key_word_list[i]
        pynlpir.close()
        if update_hot_word == 'True':
            # update the database in a separate thread
            print('[update_hot_word] True')
            t = threading.Thread(target=db_helper.update_tables, args=(','.join(key_word_list), max_hot_words))
            t.setDaemon(True)
            t.start()
        else:
            print('[update_hot_word] False')
        return ','.join(key_word_list)
Example #3
def separateWordFromFile(fileName):
	pynlpir.open()
	file = open(fileName,'r')
	lines = file.readlines()
	i = 0
	allSegmentResult = []
	#print type(s)
	label = []
	for line in lines:
		i = i+1
		textsegment = line
		if textsegment == "\n":
			print "skip"
			continue
		##note:
		'''   Converting GBK to UTF-8:
		   gbk --> unicode --> utf-8
           broken into two steps:
                   1.    gbk --> unicode
                             Python syntax: your_string.decode("gbk")
                   2.    unicode --> utf-8
                            Python syntax: your_string.decode("gbk").encode("utf-8")
		'''
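		# A minimal illustration of the two steps above, with a hypothetical
		# Python 2 byte string (GBK bytes for a two-character word):
		#   raw = '\xd6\xd0\xce\xc4'
		#   u = raw.decode("gbk")              # step 1: gbk -> unicode
		#   utf8_bytes = u.encode("utf-8")     # step 2: unicode -> utf-8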

		segmentResult = pynlpir.segment(textsegment,pos_tagging=True)
		newSegmentResult = removePunctuation(segmentResult)
		allSegmentResult.append(newSegmentResult)

	print len(allSegmentResult)
	file.close()
	pynlpir.close()
	#print label
	return allSegmentResult
Example #4
 def segment(self, sentence):  #分词
     pynlpir.open(license_code=")VhTW_9s02tDm")
     list = pynlpir.segment(sentence)
     wordList = []
     for res in list:
         wordList.append(res[0])
     return wordList
Example #5
def wordSegmenter(sentence='', pathOfStopWords=''):
    """
    将传入的句子分词并去除停用词
    :param sentence:         传入的句子
    :param pathOfStopWords:  停用词的路径
    :return:                 分词并去除停用词后由空格分隔的字符串
    """
    #打开分词器
    pynlpir.open()
    # segment
    seg_list = []
    for seg in pynlpir.segment(sentence):
        seg_list.append(seg[0])
    # remove stop words
    resultWords = []
    if pathOfStopWords == '':  # fall back to the default stop-word list when none is given
        pathOfStopWords = path.join(ROOT, STOP_WORDS)
    f_stop = open(pathOfStopWords, 'rt', encoding='utf-8')
    try:
        f_stop_text = f_stop.read()
    finally:
        f_stop.close()
    f_stop_words = f_stop_text.split("\n")
    for seg in seg_list:
        seg = seg.strip()
        if re.match(r'[a-zA-Z0-9]+', seg):  # drop English words and digits
            continue
        if len(seg) > 0 and (seg not in f_stop_words):
            resultWords.append(seg)
    return " ".join(resultWords)
Example #6
def GetKeyWorld(filePath):  # implemented with PyNLPIR get_key_words
    #filePath='/home/yuanzhu/Desktop/NewsData/20190603/20190603419.json'
    try:
        pr.open()
        #filePath='/home/yuanzhu/Desktop/NewsData/20190501/20190501181.json'
        dicNews = GetDictFromJsonFile(filePath)
        content = dicNews['content']
        # segs=pr.segment(content)
        # for seg in segs:
        #     print(seg)
        tupkeywords = pr.get_key_words(
            content, weighted=True)  # extract key words with TF-IDF (seems to work reasonably well)
        keywords = []
        for i, w in enumerate(tupkeywords):
            keywords.append(w[0])
            if i == 9:
                break
            i += 1
    except Exception as e:
        strLogErr = 'Get {} keyword error: {}'.format(filePath, e)
        print(strLogErr)
        return None
    print("FilePath=", filePath)
    print('获取热点:', keywords)
    return keywords
Example #7
	def __init__(self):
		jieba.initialize()
		self.ltpseg = pyltp.Segmentor()
		self.ltpseg.load('model/ltp_data_v3.4.0/cws.model')
		jiagu.init()
		self.thu1 = thulac.thulac(seg_only=True)
		pynlpir.open()
Example #8
    def train(self):
        # df_table = {"valid": {"science": 35, "physics": 34, "robot": 57}, "invalid": {"fat": 30, "large": 34, "cheap": 55}}
        # The number of articles containing "science", "physics" or "robot"
        # prior_table = {"valid": 183, "invalid": 244}
        pynlpir.open()
        prior_table = {ele: 0 for ele in self.category_list}
        posterior_table = {ele: dict() for ele in self.category_list}

        i = 0
        for sample in self.training_set_material:
            buffer = sample.split("\t")
            text = buffer[0]
            seg_words = pynlpir.segment(text, pos_tagging=False)
            words_set = set(seg_words)
            try:
                label = buffer[1]
            except:
                print("Line " + str(i) + "in training set corrupted")
                continue
            prior_table[label] += 1
            for word in words_set:  # all words in the text
                if word in posterior_table[label].keys():
                    posterior_table[label][word] += 1  # posterior count +1 when this word already exists in posterior
                else:
                    posterior_table[label][
                        word
                    ] = 1  # posterior count set to 1 when this word does not yet exist in posterior
            i += 1
        return prior_table, posterior_table
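The prior/posterior tables built by train() above can be turned into class scores in the usual Naive Bayes way. The sketch below is only one plausible scoring function with add-one smoothing; the original class relies on an unshown comp_prop helper, so the name and smoothing here are assumptions:

import math

def naive_bayes_score(words_set, category, prior_table, posterior_table):
    # log P(category) plus one log-likelihood term per observed word,
    # estimated from the document counts with add-one (Laplace) smoothing
    total_docs = sum(prior_table.values())
    score = math.log(float(prior_table[category]) / total_docs)
    for word in words_set:
        doc_freq = posterior_table[category].get(word, 0)
        score += math.log((doc_freq + 1.0) / (prior_table[category] + 2.0))
    return score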
Example #9
def preprocess(filename):
    f_save = open('data/char_test.txt', 'w', encoding='utf-8')
    pynlpir.open()
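    # Each input token is written as text/tag; 'o' spans are emitted with O labels,
    # while ns/nt/nr spans are re-segmented with pynlpir and written with
    # B-LOC/I-LOC, B-ORG/I-ORG and B-PER/I-PER labels respectively.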
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            lst = line.rstrip().split(' ')
            for item in lst:
                c, t = item.split('/')
                if t == 'o':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        f_save.write(x + ' ' + 'O' + '\n')
                elif t == 'ns':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-LOC' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-LOC' + '\n')
                elif t == 'nt':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-ORG' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-ORG' + '\n')
                elif t == 'nr':
                    c = pynlpir.segment(c, pos_tagging=False)
                    for i, x in enumerate(c):
                        if i == 0:
                            f_save.write(x + ' ' + 'B-PER' + '\n')
                        else:
                            f_save.write(x + ' ' + 'I-PER' + '\n')
            f_save.write('\n')
    f_save.close()
Example #10
def splitFile(docName, encodingType):
    '''
		default encoding of docName: encodingType
		function: segment the Chinese text of docName and return the tokens
	'''
    # everything is kept in memory -- ok? maybe write to files instead
    f = file(docName, 'r')
    pynlpir.open(encoding='utf-8')
    contest = []
    line = f.readline()
    cou = 0
    while line:
        line = line.strip()
        cou += 1

        try:
            line = line.decode(encodingType)
            if line.find(testChar) != -1:  #delete the file header
                line = f.readline()
                continue
            temp = pynlpir.segment(line, pos_tagging=False)
            contest += temp
            line = f.readline()
        except:
            line = f.readline()
            # print '.'
            # print "err %s, %d"%(docName, cou)
    f.close()
    pynlpir.close()
    return contest
Example #11
def word_segment():
    in_text = codecs.open('data/xuezhong.txt', 'r', encoding='UTF-8').read()
    pynlpir.open()

    # add user-defined words to the dictionary
    nlpir.AddUserWord(c_char_p("徐骁".encode()))
    nlpir.AddUserWord(c_char_p("老怪物".encode()))
    nlpir.AddUserWord(c_char_p("徐渭熊".encode()))
    nlpir.AddUserWord(c_char_p("徐北枳".encode()))
    nlpir.AddUserWord(c_char_p("白狐儿脸".encode()))
    nlpir.AddUserWord(c_char_p("轩辕青锋".encode()))
    nlpir.AddUserWord(c_char_p("姜泥".encode()))
    nlpir.AddUserWord(c_char_p("大官子".encode()))
    nlpir.AddUserWord(c_char_p("北凉".encode()))
    nlpir.AddUserWord(c_char_p("小和尚".encode()))

    # segment the whole file
    nlpir.FileProcess('data/xuezhong.txt'.encode("utf-8"),
                      'data/xuezhong_seg_1.txt'.encode("utf-8"), False)

    key_words = pynlpir.get_key_words(in_text, max_words=100, weighted=True)
    pynlpir.close()
    print(key_words)

    print("segment finished")
Example #12
    def setUp(self):
        try:
            pynlpir.cli.update_license_file(DATA_DIR)
        except URLError:
            pass

        pynlpir.open()
Example #13
 def init(self, filename=TRAINSETFILE, IsTraining=True, IsSegment=True):
     with open(filename, encoding='GB18030') as file:
         filereader = csv.reader(file,
                                 dialect='excel-tab',
                                 quoting=csv.QUOTE_NONE)
         if not IsSegment:
             for item in filereader:
                 self.userlist.append(item)
         else:
             pynlpir.open()
             if IsTraining:
                 infoflag = 4
             else:
                 infoflag = 1
             # count_test =0
             for userquery in filereader:
                 userdict = {}
                 self.userinfo.append(userquery[:infoflag])
                 for item in userquery[infoflag:]:
                     for word in pynlpir.segment(item, pos_tagging=False):
                         if word not in self.dict.keys():
                             self.dict[word] = 0
                         if word in userdict.keys():
                             userdict[word] += 1
                         else:
                             userdict[word] = 1
                 self.userlist.append(userdict)
                 # count_test +=1
                 # if count_test>100:
                 #   break
             pynlpir.close()
     self.IsTraining = IsTraining
     self.IsSegment = IsSegment
     self.IsDF = False
Example #14
def drive_start(tag_flag = True):
    targ_name = u'汇总词典.dic'
    all_dicts = {u'药物':'MED', u'疾病':'DIS', u'症状':'SYM', u'手术检查':'TRE'}
    ner_mm_rule.map_dict('./tag/dictionary', all_dicts, targ_name)
    pynlpir.open()
    if tag_flag == True:
        tag_ner()
Example #15
def main(input_file, output_file):
    pynlpir.open()
    fw = open(output_file, 'w+', encoding='utf-8')
    pos2id = get_pos_map()
    data = read_corpus(input_file)
    for _sent, _tags in data:
        sent = ''.join(_sent)
        result = pynlpir.segment(sent,
                                 pos_tagging=True,
                                 pos_names='parent',
                                 pos_english=True)
        # print(result)
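        # Walk the segmentation result character by character: characters of a
        # reserved POS get B-/I- prefixes, everything else is labelled O, and the
        # original per-character tag from the corpus is written alongside.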
        i = 0
        for _word, _speech in result:
            for j in range(len(_word)):
                char = _word[j]
                speech = ''
                if _speech is None or _speech not in reserve_pos_list:
                    speech = 'O'
                else:
                    speech = '-'.join(_speech.split(' '))
                    if j == 0:
                        speech = 'B-' + speech
                    else:
                        speech = 'I-' + speech
                if i >= len(_tags):
                    print(i, len(_sent), _sent)
                fw.write(char + ' ' + _tags[i] + ' ' + speech + '\n')
                i += 1
        fw.write('\n')
    fw.close()
    pynlpir.close()
Example #16
 def get_tokenised_parts(self):
     pynlpir.open()
     for s in self.sentences:
         sen_parts = re.split('[?!.,。,?!]', s)
         for sen_part in sen_parts:
             tokens = pynlpir.segment(sen_part)
             yield tokens
Example #17
    def train(self):
        # df_table = {"valid": {"science": 35, "physics": 34, "robot": 57}, "invalid": {"fat": 30, "large": 34, "cheap": 55}}
        # The number of articles containing "science", "physics" or "robot"
        # prior_table = {"valid": 183, "invalid": 244}
        pynlpir.open()
        prior_table = {ele: 0 for ele in self.category_list}
        posterior_table = {ele: dict() for ele in self.category_list}

        i = 0
        for sample in self.training_set_material:
            buffer = sample.split("\t")
            text = buffer[0]
            seg_words = pynlpir.segment(text, pos_tagging=False)
            words_set = set(seg_words)
            try:
                label = buffer[1]
            except:
                print("Line " + str(i) + "in training set corrupted")
                continue
            prior_table[label] += 1
            for word in words_set:   # all words in the text
                if word in posterior_table[label].keys():
                    posterior_table[label][word] += 1   # posterior count +1 when this word already exists in posterior
                else:
                    posterior_table[label][word] = 1  # posterior count set to 1 when this word does not yet exist in posterior
            i += 1
        return prior_table, posterior_table
Example #18
def fenci(content):
    dict = {}
    # pr.open()
    # dicConf=GetDicConfig()
    # FilePath=dicConf['Testfilepath']
    # DicNews=GetDictFromJsonFile(FilePath)
    # content=DicNews['content']
    pr.open()
    segs = pr.segment(content, pos_english=False, pos_names='child')
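    # pos_names='child' with pos_english=False yields fine-grained Chinese POS
    # names, so place names ('地名') and person names ('人名') can be matched below.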
    AllList = []
    NamedList = []
    OtherList = []
    for w, c in segs:
        if len(w) < 2:
            continue
        else:
            AllList.append(w)
            if c == '地名' or c == '人名':
                NamedList.append(w)
            else:
                OtherList.append(w)
    #print("NameList=",NamedList)
    #print('OtherList=',OtherList)
    #print('Alllist=',AllList)
    dict.update({'NameList': NamedList})
    dict.update({'OtherList': OtherList})
    dict.update({'AllList': AllList})
    pr.close()
    return dict
Example #19
    def setUp(self):
        try:
            pynlpir.cli.update_license_file(DATA_DIR)
        except URLError:
            pass

        pynlpir.open()
Example #20
def seg2(curpus_path, seg_path, seg_test_path):
    pynlpir.open(encoding='gbk')
    check_dir_exist(seg_path)
    check_dir_exist(seg_test_path)
    cat_folders = os.listdir(curpus_path)
    i = 0
    for folder in cat_folders[::-1]:
        folds_path = os.path.join(curpus_path, folder)
        folds = os.listdir(folds_path)
        for fold in folds:
            files_path = os.path.join(folds_path, fold)
            files = os.listdir(files_path)
            for file in files:
                i += 1
                from_file = os.path.join(files_path, file)
                if i < 55000:
                    to_file = os.path.join(seg_path, str(i) + '.txt')
                elif i < 125000:
                    to_file = os.path.join(seg_test_path,
                                           str(i - 55000) + '.txt')
                else:
                    pynlpir.close()
                    return
                nlpir.FileProcess(from_file.encode('UTF-8'),
                                  to_file.encode('UTF-8'), True)
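                # FileProcess writes a POS-tagged segmentation of from_file to
                # to_file; the regex below then keeps only the nouns (tagged /n).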
                content = readfile(to_file, encoding='gbk')
                pat = re.compile(u'\s+([\u4e00-\u9fa5]+)/n')
                result = pat.findall(str(content))
                write(to_file, ' '.join(result))
Example #21
def open(conf=CONF_PATH):
    global CONF_PARSER
    if not CONF_PARSER:
        if not os.path.isfile(conf):
            raise IOError('Config file does not exist.')
        CONF_PARSER = ConfigParser()
        CONF_PARSER.read(conf)
    pynlpir.open()
Example #22
    def __init__(self):
        # load stopwords
        f = open('../data/stopwords.txt', 'r')
        lines = f.readlines(-1)
        for line in lines:
            self.stopwords.add(line.strip().decode('utf-8'))

        pynlpir.open()
Example #23
def cut(data_list):
    """
    分词
    """
    pynlpir.open()
    data_list = [(pynlpir.segment(x)) for x in data_list]
    pynlpir.close()
    return data_list
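A small usage sketch for cut above; the sample list is an assumption, and each returned item is a list of (word, POS) pairs because pos_tagging defaults to True:

if __name__ == '__main__':
    print(cut(['我喜欢自然语言处理']))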
Example #24
def wordSegment(file):
    pynlpir.open()
    des_filename = '/home/cc/Desktop/sentences.txt'
    with open(des_filename, 'w') as fout:
        for line in file:
            r = pynlpir.nlpir.ParagraphProcess(line, False)
            fout.write(r + '\n')
    pynlpir.close()
Example #25
def main():
    # directory where results are saved
    result_dir = r'D:\semantic analysis\新结果\共现网络//'
    txt_dir = r"D:\semantic analysis\新纯文本\1常用词//"
    # k_list = util.get_key_list()
    # k_list = ['不约而同', '喜闻乐见', '努力', '感觉', '简单', '无聊', '希望', '美好']
    # centre keywords
    k_list = ['美好']
    # path to the jieba segmentation dictionary
    # jieba.set_dictionary("D:\semantic analysis\分词\词库\导出结果\dict1.txt")
    # jieba.initialize()
    pynlpir.open()
    for key in k_list:
        print(key)
        pynlpir.nlpir.AddUserWord(c_char_p(key.encode()))

    for key in k_list:
        print(key)
        # files for this keyword
        file_list = util.get_file_list(txt_dir + key, ".txt")
        # create the result directory
        mk_dir(result_dir + key)
        # mk_dir(result_dir+key+'//w')
        mk_dir(result_dir + key + '//p')

        for n_file in file_list:
            s_list = util.get_list_from_file(txt_dir + key + "//" + n_file)
            # de-duplicate sentences to avoid counting them twice
            print(len(s_list))
            s_list = list(set(s_list))
            print(len(s_list))

            # build the network over all sentences
            # ps_list, mn, pps_list,pmn = create_matrix(s_list,key)
            pps_list, pmn = create_matrix(s_list, key)

            pkl_name = n_file[:-4] + '.pkl'

            # for w_list in ps_list:
            #     # 创建整句话的网络
            #     mn.add_edges(w_list)
            # util.save_nw(mn.get_network(), result_dir+key+'//w//' + pkl_name)

            for w_list in pps_list:
                # pmn.add_edges(w_list)
                pmn.add_gram_edges(w_list)
            util.save_nw(pmn.get_network(),
                         result_dir + key + '//p//' + pkl_name)

            print(n_file)
            print(
                time.strftime('%Y-%m-%d %H:%M:%S',
                              time.localtime(time.time())))

            with open(result_dir + key + '//record.txt', 'a',
                      encoding='utf-8') as rf:
                rf.write(n_file + '\n')
    pynlpir.close()
Example #26
    def segment(self):
        """
        fni:  str;  input file name with path
        fno:  str;  output file name with path
        lang: str;  language code
        pos:  bool; POS tags included
        n:    int;  no. of lines processed
        """
        import copy
        from PyQt5.QtWidgets import QApplication
        from opencc import OpenCC

        openCC = OpenCC('t2s')  # convert from Traditional-to-Simplified
        pynlpir.open(encoding="utf-8")
        print("Finished initializing ITCLAS/NLPIR")
        count = lineCount(self.fni)
        fit  = open(self.fni, "r", encoding="UTF-8")
        fot  = open(self.fno, "w", encoding="UTF-8", newline="\n")

        sep = " " # separator of Chinese tokens (space by default)
        n = 0
        for linet in fit:

            n += 1
            if (linet.strip() == ''): # empty string
                fot.write("\n")
                continue
            lines = openCC.convert(linet.strip())
            lines_seg = pynlpir.segment(lines, pos_tagging=True, pos_names=None) 
            # segment with optional POS-tagging

            # The following segments the zht text according to the
            # segmentation patterns obtained from NLPIR above
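            # The loop below assumes the t2s conversion preserves character counts,
            # so slicing linet by each simplified token's length re-applies the same
            # segmentation to the original traditional-character line.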
            tokens   = []  # initialize list to hold 'words' of segmented zht line
            pos_tags = []  # initialize list to hold pos tags of segmented words
            while len(lines_seg) > 0:  # loop until nothing is left in lines_seg
                t, p = lines_seg.pop(0)  # remove leftmost zhs token and save to variable t0
                m = len(t)  # no. of characters in token
                tokens.append(linet[0:m])  # add corresponding zht token to tokens[]
                pos_tags.append(p)
                linet = linet[m:]  # delete token from zht line (from beginning of string)

            #fot.write(sep.join(tokens)+"\n")  # write zht-seg output
            tok_pos = ["{}_{}".format(x, y) for x,y in zip(tokens, pos_tags)]  # list of tok_pos pairs
            fot.write(sep.join(tok_pos)+"\n")
            #if (n == 1): break
            if n % 50 == 0:
                self.window.ui.progressBar.setValue(round(100 * n / self.fi_linecount, 0))
                self.window.ui.progressBar.repaint()
                QApplication.processEvents()
        self.window.ui.progressBar.setValue(100)
        self.window.ui.progressBar.repaint()

        fit.close()
        fot.close()
        pynlpir.close()
        self.numLineProcessed = n
        return n
Example #27
def get_words(sentence,query_id):
    # input: a sentence string; output: the list of words the sentence is split into
    pynlpir.open()
    print 'sentence : ' + str(sentence)
    try:
        sentence_words_list = pynlpir.segment(sentence,pos_tagging=False)
        return sentence_words_list
    except BaseException:
        return ['ERROR',str(query_id)]
Example #28
    def run(self):
        starttime = time.time()
        pynlpir.open()

        print('PyNLPIR:')
        print("".join(
            [(x + '/ ') for x in pynlpir.segment(self.test_text, pos_tagging=False) if x not in self.stopwords]))
        endtime = time.time()
        print('time cost:' + str(round((endtime - starttime), 4)) + ' seconds.\n')
Example #29
def build_word_vectors(infile_name, outfile_name):
    print('building word vectors...')
    pynlpir.open()
    jieba.initialize()

    df = pd.read_json(infile_name)
    with open(outfile_name, 'w') as f:
        for content in tqdm(df.article_content):
            f.write(' '.join(word_tokenize(content)))
Example #30
def nlpir_seg_txt(text):
    # segment with NLPIR and remove stop words
    pynlpir.open()
    # segment the text
    segments = pynlpir.segment(text, pos_tagging=False, pos_english=False)
    # remove stop words
    remove_segments = [word for word in segments if word not in stopwords and len(word) > 1]
    pynlpir.close()
    return remove_segments
Example #31
def extract_text_type(user, text, user_dict, entity_dict, keyword_dict,
                      topic_word_list, all_topic_list):
    pynlpir.open()
    user_id = user_dict[user]
    entity_list = find_raw_entity(text, entity_dict)
    keyword_list = find_keyword(text, keyword_dict)
    topic_id = find_topic(text, topic_word_list, all_topic_list)
    pynlpir.close()
    return user_id, entity_list, keyword_list, topic_id
Example #32
def segment(path='F:/Data/Chinese/chinese.json',
            json_path='F:/Data/Chinese/chinese_token.json'):
    """
    NLPIR分词+根据词性清洗+去掉为问题或回答空的项
    :param path: 源数据路径
    :param json_path: 结果保存路径
    :return:
    """
    # 启动分词工具
    pynlpir.open()
    # 只保留文本部分,并分词,根据词性过滤
    # 保留以下词性的词,并去除词性标记
    # 词性含义请查看https://github.com/tsroten/pynlpir/blob/master/pynlpir/pos_map.py
    word_filter = {
        'noun', 'time word', 'locative word', 'noun of locality', 'verb',
        'adjective', 'distinguishing word', 'status word', 'numeral'
    }
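    # pynlpir.segment() is called below with its defaults (pos_tagging=True,
    # pos_names='parent', pos_english=True), so w[1] is an English parent POS
    # name and can be matched against the set above.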
    # drop questions whose segmentation misbehaves
    question_id_filter = {294118450, 300106271, 291834409}

    # read, process and write line by line to keep memory use low
    count = 0
    with open(path, 'r') as f_in, open(json_path, 'w') as f_out:
        for line in f_in:
            q = json.loads(line)
            if q['question_id'] in question_id_filter:
                continue
            # strip embedded newlines and lowercase
            if '\n' in q['question']:
                print 'question:'
                print q['question']
                q['question'] = q['question'].replace('\n', ' ')
            q['question'] = [
                w[0] for w in pynlpir.segment(q['question'].lower())
                if w[1] in word_filter and w[0] != u''
            ]

            for a in q['answers']:
                # strip embedded newlines
                if '\n' in a['answer']:
                    print 'answer:'
                    print a['answer']
                    a['answer'] = a['answer'].replace('\n', ' ')
                a['answer'] = [
                    w[0] for w in pynlpir.segment(a['answer'].lower())
                    if w[1] in word_filter and w[0] != u''
                ]
            # drop empty answers
            q['answers'] = [a for a in q['answers'] if len(a['answer']) > 0]
            count = count + 1
            if count % 1000 == 0:
                print count
            # keep only items with a non-empty question and at least one answer
            if len(q['question']) > 0 and len(q['answers']) > 0:
                f_out.write(json.dumps(q))
                f_out.write('\n')
    pynlpir.close()
Example #33
def word_segment(text):
    pynlpir.open()
    segments = nlpir.segment_pos(text)
    segment_result = []
    pos_result = []
    for segment in segments:
        segment_result.append(segment[0])
        pos_result.append(segment[1])
    pynlpir.close()
    return segment_result, pos_result
Example #34
def nlpir_keywords(text,n):
	pynlpir.open()
	# print '关键词测试:\n'
	key_words = list(pynlpir.get_key_words(text,n,weighted=False))
	# for key_word in key_words:
	#     print key_word[0], '\t', key_word[1]
	 
	pynlpir.close()
	
	print key_words
	return key_words
Example #35
def get_key_words(text):
    pynlpir.open()
    result = []
    keywords = pynlpir.get_key_words(text, weighted=True)
    if len(keywords) == 0:
        return result
    for i in range(len(keywords)):
        keyword = keywords[i][0]
        result.append(keyword)
    pynlpir.close()
    return result
Example #36
def words_cixing(question,pos=1):
    # pos=1: include POS tags; otherwise do not
    pynlpir.open()
    if pos:
        pos1=['{}/{}'.format(k,v)for k,v in pynlpir.segment(question, pos_names=None,pos_tagging=pos)]
    else:
        pos0=pynlpir.segment(question)
    pynlpir.close()
    if pos:
        return pos1
    else :
        return pos0
Example #37
 def query2words(self,query):
   words = []
   segs = query.split(' ')
   for s in segs:
     s = s.strip() ## need regularization
     if s in self.vocab: words.append(s) ## in word2vec vocab
     else:
       pynlpir.open()
       # words.extend(pynlpir.get_key_words(query,max_words=3))
       word_segs = pynlpir.segment(query,pos_tagging=False)
       for word in word_segs:
         if word not in self.stop_list: words.append(word)
       print(words)
   return words
Example #38
 def _part_document(self):
   pynlpir.open()
   docs = {}
   for dirname, dirnames,filenames in os.walk('dependence/new_data'):
     for filename in filenames:
       path = os.path.join(dirname, filename)
       text = ''
       with io.open(path, 'r',encoding='utf-8') as f:
         text = f.readline()
         words = pynlpir.segment(text,pos_tagging=False)
         clean_words = [w for w in words if w not in self.stop_list and len(w)>1]
         index = filename[:6]
         docs[index] = clean_words
   dictionary = corpora.Dictionary(docs.values())
   corporas = {index: dictionary.doc2bow(docs[index]) for index in docs}
   return docs, dictionary, corporas
Example #39
 def test_license_auto_update(self):
     """Tests that the auto-update of the license works."""
     try:
         # switch old one to the new one
         os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                   os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)))
         os.rename(os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)),
                   os.path.join(DATA_DIR, LICENSE_NAME))
         pynlpir.open()
         pynlpir.close()
     finally:
         # switch back the license
         os.rename(os.path.join(DATA_DIR, LICENSE_NAME),
                   os.path.join(DATA_DIR, "{}.old".format(LICENSE_NAME)))
         os.rename(os.path.join(DATA_DIR, "{}.copy".format(LICENSE_NAME)),
                   os.path.join(DATA_DIR, LICENSE_NAME))
Example #40
    def predict(self, text):
        # words = [word1, word2, word3, ...]
        pynlpir.open()
        seg_words = pynlpir.segment(text, pos_tagging=False)
        words_set = set(seg_words)
        result = dict()
        for category in self.category_list:
            prob = self.comp_prop(category, words_set)
            result[category] = prob

        """
        buffer = [result[my_key] for my_key in result.keys()]
        score_sum = sum(buffer)
        # result = {my_key: result[my_key]/score_sum for my_key in result.keys()}
        """
        buffer = list(result.items())
        buffer.sort(key=lambda x: x[1], reverse=True)
        top_category = buffer[0][0]
        return top_category
Example #41
 def __call__(self, value, positions=False, chars=False,
              keeporiginal=False, removestops=True,
              start_pos=0, start_char=0, mode='', **kwargs):
     assert isinstance(value, text_type), "%r is not unicode" % value
     t = Token(positions, chars, removestops=removestops, mode=mode,
         **kwargs)
     nlpir.Init(nlpir.PACKAGE_DIR, nlpir.UTF8_CODE)
     pynlpir.open()
     pynlpir.open(encoding='utf-8')
     seglist = pynlpir.segment(value,)
     for w in seglist:
         t.original = t.text = w
         t.boost = 1.0
         if positions:
             t.pos=start_pos+value.find(w)
         if chars:
             t.startchar=start_char+value.find(w)
             t.endchar=start_char+value.find(w)+len(w)
         yield t      # yield each segmented token from the generator
Example #42
 def document2sentences(self,document):
   pynlpir.open()
   words = pynlpir.segment(document,pos_tagging=False)
   sign = ['。', ';', '.', ';']
   pause_position = []
   for i in range(len(words)):
     if words[i] in sign: pause_position.append(i)
   setences = []
   if len(pause_position) == 0:
     clean_d = [s.strip() for s in words if s not in self.stop_list]
     setences.append(' '.join(clean_d)+'\n')
   else:
     for i in range(len(pause_position)):
       setence = []
       if i == 0: setence = words[:pause_position[i]]
       elif i == len(pause_position)-1 and i != 0: break
       else: setence = words[pause_position[i]:pause_position[i+1]]
       clean_s = [s.strip() for s in setence if s not in self.stop_list]
       setences.append(' '.join(clean_s)+'\n')
   return setences
Example #43
def read_lexical_datas(file, compose_func=None):
    pynlpir.open()
    f = open(file, 'r', encoding='utf-8')
    tokens_list = [pynlpir.segment(line.rstrip('\n').replace('幺', '一'), pos_tagging=False) for line in f]
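    # With no compose_func, build a binary bag-of-words matrix over the token
    # vocabulary; otherwise compose pre-trained word vectors for each line.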
    if compose_func is None:
        word_idx = {}
        for tokens in tokens_list:
            for token in tokens:
                if token not in word_idx:
                    word_idx[token] = len(word_idx)
        array = numpy.zeros([len(tokens_list), len(word_idx)])
        for i, tokens in enumerate(tokens_list):
            for token in tokens:
                array[i][word_idx[token]] = 1.0
    else:
        print('reading word vectors')
        word_vecs = word_vec.read_word_vec(r'../data/vectors_cbow')
        print('reading complete')
        array = numpy.asarray([compose_func(tokens, word_vecs) for tokens in tokens_list])
    return array
Example #44
def main():
    py.open()
    a = sys.argv[1]
    result = py.segment(a)
    res_str = []
    for r in result:
        if len(r[0]) == 2 and (r[1] == "noun" or r[1] == "verb" or r[1] == "adjective"):
            f_result = fsame.find(r[0])
            ff_result = fsame.ffind(r[0])
            if f_result == r[0] or ff_result == r[0]:
                res_str.append(r[0])
            else:
                if random.randint(0, 1) == 0:
                    res_str.append(f_result)
                else:
                    res_str.append(ff_result)
        else:
            res_str.append(r[0])
    print "".join(res_str)
    py.close()
Example #45
def extract_news_kws(hot_news):
    pynlpir.open()
    s = hot_news
    kw_list = pynlpir.segment(s, pos_tagging=True, pos_names=None)
    kws = ""
    for kw in kw_list:
        pos = kw[0]
        tagging = kw[1]
        try:
            if tagging:
                # tagging is None when the token is a whitespace character
                tagging_first = tagging[0]
            else:
                tagging_first = ""
        except:
            tagging_first = ""
        if tagging_first == "n" and len(pos) > 1:
            if pos != "quot":
                kws = kws + pos + u" "
    kws = kws.strip(u" ")
    return kws
Example #46
def part_sentence(stop_list):
  pynlpir.open()
  for dirname, dirnames,filenames in os.walk('dependence/ch_corporas/wiki/lost'):
      for filename in filenames:
        lines = []
        read_path = os.path.join(dirname, filename)
        rf = open(read_path,'rb')
        print(filename)
        for line in rf:
          # detector.feed(byte)
          encoding = chardet.detect(line)['encoding']
          if encoding == None: encoding = 'utf-8'
          new_line = line.decode(encoding,'ignore')
          words = pynlpir.segment(new_line,pos_tagging=False)
          clean_words = [w.strip() for w in words if w not in stop_list]
          str_line = ' '.join(clean_words)
          if str_line: lines.append(str_line+'\n')
        rf.close()
        write_path = os.path.join('dependence/ch_corporas/wiki_clean', filename)
        wf = open(write_path, 'w')
        wf.writelines(lines)
        wf.close()
Example #47
    def __init__(self, content, norm="l1_norm"):
        self.norm = norm
        pynlpir.open()
        words = pynlpir.segment(content, pos_tagging=True, pos_names=None)

        kws = ""
        for word in words:
            pos = word[0]
            tagging = word[1]
            try:
                if tagging:
                    # tagging is None when the token is a whitespace character
                    tagging_first = tagging[0]
                else:
                    tagging_first = ""
            except:
                tagging_first = ""
            if tagging_first == "n" and len(pos) > 1:
                if pos != "quot":
                    kws = kws + pos + u" "
        result = kws.split(" ")
        self.PoS = result
Example #48
def tokenize(file):
    words = []
    pynlpir.open()
    directory = '\\resources\\original files\\htl_del_4000\\'
    posWords = codecs.open(directory + file + 'Words.txt', 'w+', 'utf-8')
    with codecs.open(directory + file + '.txt', 'r', 'utf-8') as posFile:
        for s in posFile.readlines():
            # print posFile.readline()
            a = pynlpir.segment(s, pos_tagging=False)
            # print a
            for i in range(len(a)):
                # print a[i]
                if i != (len(a) - 1):
                    # print 'i='+str(i)
                    # print 'a='+str(len(a))
                    posWords.write(a[i] + ' ')
                else:
                    posWords.write(a[i] + '\r')
                    # for i in a:
                    #    posWords.write(i + ';')
                    # posWords.write('\0')
    posWords.close()
Example #49
	def post(self,request):
		obj_id = request.POST['obj_id']
		school = MySchool.objects.get(id=int(obj_id))
		feeds = []

		# weibo
		# App Key:802677147
		# App Secret:f75be23800d779cc9dbbf6b467b7ff61		
		# Redirect url: https://api.weibo.com/oauth2/default.html
		# code: 4ccb7879bf204466b80e02c106d09727

		# read baidu
		params = {'keyword':school.name}

		# send a 3rd party service request
		baidu_consumer.delay(params)

		# read saved feeds
		feeds = MyBaiduStream.objects.filter(school=school).order_by('-last_updated')[:100]
		content = loader.get_template(self.template_name)
		tieba_html= content.render(Context({
			'obj':school,
			'feeds': feeds,
			}))

		# hot topics
		pynlpir.open() # must have this line!
		topics = feeds[:50]
		content = loader.get_template(self.newsticker_template_name)
		newsticker_html= content.render(Context({
			'objs':topics,
			'keywords': pynlpir.get_key_words(''.join([f.name+f.description for f in feeds]), max_words=50, weighted=True)
			}))
		pynlpir.close()

		return HttpResponse(json.dumps({'bd_html':tieba_html,'news_html':newsticker_html}), 
			content_type='application/javascript')
Example #50
import csv
import sys
import pynlpir

csv.field_size_limit(sys.maxsize)

pynlpir.open()

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/places_raw.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/pop_raw.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/poprecommend_raw.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/politics_raw.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/0.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/1.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/2.txt")
print "num imported: " + str(n)

n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/3.txt")
print "num imported: " + str(n)
Example #51
 def __init__(self, file):
     with open(file, 'r', encoding="utf-8") as f:
         content = f.read()
         pynlpir.open()
         result = pynlpir.segment(content, pos_tagging=False)
     self.PoS = result
Example #52
def main(argv):
	csv.field_size_limit(sys.maxsize)

	pynlpir.open()
	#load sogou dictionaries

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/places_raw.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/pop_raw.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/poprecommend_raw.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/politics_raw.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/0.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/1.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/2.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/3.txt")
	print "num imported: " + str(n)

	n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/4.txt")
	print "num imported: " + str(n)

	#write out parsed words
	# files = ["week" + str(i) + ".csv" for i in range(4,53)]

	# for fi in files:
	fi = "week" + str(argv[0]) + ".csv"
	print fi
	# for fi in files:
	infile = "./../" + fi
	a = fi.split('.')
	outfile = "./../parsed/" + a[0] + "parsed.csv"

	f = open(infile, 'rb')
	of = open(outfile, 'wb')
	reader = csv.reader(f, delimiter=",")
	writer = csv.writer(of, delimiter=",", quotechar='|', quoting=csv.QUOTE_MINIMAL)
	count = 0
	total = sum(1 for row in reader)
	print total
	f.seek(0)
	errors = 0
	unbounderrors = 0

	for row in reader:
		mid = row[0]
		message = row[6]
		censored = None
		try:
			segmented = pynlpir.segment(message)
		except UnicodeDecodeError:
			errors += 1
			continue
		except UnboundLocalError:
			unbounderrors += 1
			print "what??"
			continue
		except:
			print "core dump...?"
			continue

		mString = ""
		for segment in segmented:
			mString += segment[0]
			mString += " "

		if row[10]!="":
			censored = 1
		else:
			censored = 0

		writer.writerow([mid, censored, mString.encode("utf-8")])

		# progress
		if count%1000 == 0:
			print str(count) + "/" + str(total) + "\r",
			sys.stdout.flush()
		count += 1


	print "count: " + str(count)
	print "errors: " + str(errors)
	print "unbounderrors: " + str(unbounderrors)
Example #53
def main():
    csv.field_size_limit(sys.maxsize)

    pynlpir.open()
    # load sogou dictionaries

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/places_raw.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/pop_raw.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/poprecommend_raw.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/politics_raw.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/0.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/1.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/2.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/3.txt")
    print "num imported: " + str(n)

    n = pynlpir.nlpir.ImportUserDict("./../Scel2Txt/weibo-data/data/4.txt")
    print "num imported: " + str(n)

    # write out parsed words
    files = ["week" + str(i) + ".csv" for i in range(1, 53)]

    print "loading user data into map"
    userdict = dict()
    userdata = "./../userdata.csv"
    with open(userdata, "rb") as users:
        reader = csv.reader(users, delimiter=",")
        for row in reader:
            userdict[row[0]] = [row[1], row[2], row[3]]

    outfile = "full_uncensored_sample.csv"
    addedCount = 0

    for fi in files:
        # fi = "week" + str(argv[0]) + ".csv"
        print fi
        # for fi in files:
        infile = "./../" + fi
        a = fi.split(".")

        f = open(infile, "rb")
        of = open(outfile, "a")
        reader = csv.reader(f, delimiter=",")
        writer = csv.writer(of, delimiter=",", quotechar="|", quoting=csv.QUOTE_MINIMAL)
        count = 0
        total = sum(1 for row in reader)
        print total
        f.seek(0)
        errors = 0
        unbounderrors = 0

        for row in reader:
            if count % 2421 == 0 and row[10] == "":
                mid = row[0]
                message = row[6]
                censored = 0
                try:
                    segmented = pynlpir.segment(message)
                except UnicodeDecodeError:
                    errors += 1
                    continue
                except UnboundLocalError:
                    unbounderrors += 1
                    print "what??"
                    continue
                except:
                    print "core dump...?"
                    continue

                mString = ""
                for segment in segmented:
                    mString += segment[0]
                    mString += " "

                try:
                    d = userdict[row[2]]
                except KeyError:
                    # print "no key for userid " + row[2]
                    continue
                except ValueError:
                    print "ValueError"
                    continue

                writer.writerow(
                    [mid, censored, mString.encode("utf-8"), row[1], row[2], row[5], row[8], row[9], d[0], d[1], d[2]]
                )
                addedCount += 1
                # progress
            if count % 1000 == 0:
                print str(count) + "/" + str(total) + "\r",
                sys.stdout.flush()
            count += 1

        print "addedCount: " + str(addedCount)
        print "count: " + str(count)
        print "errors: " + str(errors)
        print "unbounderrors: " + str(unbounderrors)
Example #54
def createDocMapAndClickInfo(total_set_file, doc_set_file):
    doc_map1 = {}  # doc map (facilitates the computation in PLSA)
    doc_map2 = {}  # reverse doc map (id -> doc)
    user_set = set()  # all users
    doc_set = set()  # all documents
    doc_click_count = {}  # click count per document
    user_doc_click_count = {}  # click count per (user, document) pair

    if os.path.isfile(doc_set_file):
        is_write_need_file = True
    else:
        is_write_need_file = False
    fp_total_set = open(total_set_file, 'r')
    if is_write_need_file == False:
        fp_doc_set = open(doc_set_file, 'w')
        fp_doc_map1 = open('data//doc_map1.csv', 'w')
        fp_doc_map2 = open('data//doc_map2.csv', 'w')
        fp_doc_click_count = open('data//doc_click_count.csv', 'w')
        fp_user_doc_click_count = open('data//user_doc_click_count.csv', 'w')
    cnt = 0
    pynlpir.open()
    for line in fp_total_set:
        word = line.split('\t')
        user_set.add(word[0])
        doc_set.add(word[1])
        doc_click_count.setdefault(word[1], 0)
        doc_click_count[word[1]] += 1
        user_doc_click_count.setdefault(word[0], {})
        if user_doc_click_count[word[0]].has_key(word[1]) == False:
            user_doc_click_count[word[0]][word[1]] = 0
        user_doc_click_count[word[0]][word[1]] += 1
        if doc_map1.has_key(word[1]) == False:
            doc_map1[word[1]] = cnt
            doc_map2[cnt] = word[1]
            cnt += 1
            if is_write_need_file == False:
                # title_split_result = pynlpir.nlpir.ParagraphProcess(word[4], True)
                content_split_result = pynlpir.nlpir.ParagraphProcess(word[5], True)
                fp_doc_set.write('%s\t%s' %(word[1], content_split_result))#, content_split_result))

    # doc_map = sorted(doc_map1.items(), key=lambda d:d[1], reverse=False)
    if is_write_need_file == False:
        for d, dtag in doc_map1.items():
            fp_doc_map1.write('%s %d\n' %(d, dtag))
        for dtag, d in doc_map2.items():
            fp_doc_map2.write('%d %s\n' %(dtag, d))
        for d, dclicks in doc_click_count.items():
            fp_doc_click_count.write('%s %d\n' %(d, dclicks))
    user_clicks = 0
    for u, uitem in user_doc_click_count.items():
        for d in uitem.keys():
            if is_write_need_file == False:
                fp_user_doc_click_count.write('%s %s %d\n' %(u, d, uitem[d]))
            user_clicks += uitem[d]
    print 'user clicks = ', user_clicks

    pynlpir.close()
    if is_write_need_file == False:
        fp_doc_set.close()
        fp_total_set.close()
    print 'number of users:', len(user_set)
    print 'number of documents:', len(doc_set)

    print 'createDocMap end'
    return user_set, doc_set, doc_map1, doc_map2, doc_click_count, user_doc_click_count
Example #55
def import_userdict(file_dir):
    pynlpir.open()
    nlpir.import_userdict(file_dir)
    pynlpir.close()
Example #56
 def setUp(self):
     try:
         pynlpir.open()
     except pynlpir.LicenseError:
         pynlpir.cli.update_license_file(DATA_DIR)
         pynlpir.open()