Example #1
    def wordToVocabulary(self, originFile, vocabFile, segementFile):
        # stopwords = [i.strip() for i in open(self.stopwordsFile).readlines()]
        # print(stopwords)
        # exit()
        vocabulary = []
        sege = open(segementFile, "w")
        with open(originFile, 'r') as en:
            for sent in en.readlines():
                # strip punctuation
                if "enc" in segementFile:
                    sentence = sent.strip()
                    words = jieba.lcut(sentence)
                    print(words)
                else:
                    words = jieba.lcut(sent.strip())
                vocabulary.extend(words)
                for word in words:
                    sege.write(word + " ")
                sege.write("\n")
        sege.close()

        # deduplicate and write to the vocabulary file
        vocab_file = open(vocabFile, "w")
        _vocabulary = list(set(vocabulary))
        _vocabulary.sort(key=vocabulary.index)
        _vocabulary = self.vocab + _vocabulary
        for index, word in enumerate(_vocabulary):
            vocab_file.write(word + "\n")
        vocab_file.close()
Example #2
def search(request):
	if request.user.is_authenticated():
		if request.GET:
			input_word=request.GET["search"]
			seg_list= jieba.lcut(input_word, cut_all=True)
			result=[]
			for art in Art.objects.all():
				art_name_seg=jieba.lcut(art.name ,cut_all=True)
				for key in seg_list:
					if any(key==name for name in art_name_seg):
						result.append(art)
					elif any(key==style for style in art.tag_set.all()[0].style):
						result.append(art)
					elif any(key==word for word in jieba.lcut(art.tag_set.all()[0].description ,cut_all=True)):
						result.append(art)
			
		if request.POST:
			name_collection=request.POST['getcollection']
			Collection.objects.get_or_create(collecter=request.user,defaults={'collecter':request.user})
			Art.objects.update_or_create(name=name_collection,defaults={'collecter':Collection.objects.filter(collecter=request.user)})

		return render_to_response('search.html',RequestContext(request,locals()))


	else:
		return HttpResponseRedirect("/login/")
Example #3
    def __init__(self):
        # load the jieba user.f_dict
        root_path = os.path.dirname(os.path.abspath(__file__))
        jieba.load_userdict(os.path.join(root_path, "f_dict/user.dict"))

        # get the positive f_corpus and length
        self.pos_doc_list = []
        with open(os.path.join(root_path, "f_corpus/waimai/positive_corpus_v1.txt"), encoding="utf-8") as pos_f:
            for line in pos_f:
                # self.pos_doc_list.append(list(set(jieba.lcut(line.strip()))))
                self.pos_doc_list.append(jieba.lcut(line.strip()))
        self.pos_doc_length = len(self.pos_doc_list)

        # get the negative f_corpus and length
        self.neg_doc_list = []
        with open(os.path.join(root_path, "f_corpus/waimai/negative_corpus_v1.txt"), encoding="utf-8") as neg_f:
            for line in neg_f:
                # self.neg_doc_list.append(list(set(jieba.lcut(line.strip()))))
                self.neg_doc_list.append(jieba.lcut(line.strip()))
        self.neg_doc_length = len(self.neg_doc_list)

        # define the variable about train number
        self.pos_train_num = 0
        self.neg_train_num = 0
        self.pos_test_num = 0
        self.neg_test_num = 0

        runout_content = "You are using the waimai f_corpus version 1.0.\n"
        runout_content += "There are total %d positive and %d negative f_corpus." % \
                          (self.pos_doc_length, self.neg_doc_length)
        print(runout_content)
Example #4
 def preprocess():
     # Extract tokens for the positive class: pos_word_doc (list) holds each positive document's tokens; pos_word_set (set) holds all tokens across positive documents
     dir1 = os.curdir + "\\data\\Y-cut"
     files1 = os.listdir(dir1)
     pos_word_doc = list()
     pos_word_set = set()
     for name in files1:
         if name.endswith(".txt"):
             filename = dir1 + "\\" + name
             file = open(filename, "r")
             content = file.readlines()
             word_list = list()
             for line in content:
                 line.decode("utf-8")
                 seg_list = jieba.lcut(line, cut_all=False)
                 word_list.extend(seg_list)
             word_set = set(word_list)
             pos_word_doc.extend(list(word_set))
             pos_word_set = pos_word_set | word_set
             file.close()
     # Extract tokens for the negative class: neg_word_doc (list) holds each negative document's tokens; neg_word_set (set) holds all tokens across negative documents
     dir2 = os.curdir + "\\data\\N-cut"
     files2 = os.listdir(dir2)
     neg_word_doc = list()
     neg_word_set = set()
     for name in files2:
         if name.endswith(".txt"):
             filename = dir2 + "\\" + name
             file = open(filename, "r")
             content = file.readlines()
             word_list = list()
             for line in content:
                 line.decode("utf-8")
                 seg_list = jieba.lcut(line, cut_all=False)
                 word_list.extend(seg_list)
             word_set = set(word_list)
             neg_word_doc.extend(list(word_set))
             neg_word_set = neg_word_set | word_set
             file.close()
     # all_word_set: tokens across all positive and negative documents
     all_word_set = pos_word_set | neg_word_set
     word_dict = dict()
     print(len(all_word_set))
     m = 0
     # count how often each word occurs in the positive/negative class, store in word_dict, and write to dict.txt
     for word in all_word_set:
         n1 = pos_word_doc.count(word)
         n2 = neg_word_doc.count(word)
         word_dict[word] = (n1, n2)
         m += 1
         if not (m % 100):
             print(m)
     out = open(os.curdir + "\\dict.txt", "w")
     for k in word_dict:
         out.write(k.encode("utf-8") + "\t" + str(word_dict[k][0]) + "\t" + str(word_dict[k][1]) + "\n")
     out.close()
Example #5
def vectorize_2(test_words):
    input_words = jieba.lcut(test_words[0])
    print(check_neg(input_words))

    if len(jieba.lcut(test_words[0])) < 2:
        return None, False
    else:
        v = HashingVectorizer(tokenizer=comma_tokenizer, stop_words=stopwords, n_features=100000, non_negative=True)
        test_data = v.fit_transform(test_words)
        print(test_data)
        return test_data, check_neg(input_words)
Example #6
    def onlinelearning(self, input_strs, target_strs):
        input_seg = jieba.lcut(input_strs)
        target_seg = jieba.lcut(target_strs)

        input_vec = []
        for word in input_seg:
            if word not in self.enc_vocab.keys():
                vec = self.add_voc(word, "enc")
            else:
                vec = self.enc_vocab.get(word)
            input_vec.append(vec)

        target_vec = []
        for word in target_seg:
            if word not in self.dec_vocab.keys():
                vec = self.add_voc(word, "dec")
            else:
                vec = self.dec_vocab.get(word)
            target_vec.append(vec)

        with tf.Session() as sess:
            # initialize variables / restore from a checkpoint if one exists
            ckpt = tf.train.get_checkpoint_state(self.model_path)
            if ckpt is not None:
                print(ckpt.model_checkpoint_path)
                self.model.saver.restore(sess, ckpt.model_checkpoint_path)
            else:
                sess.run(tf.global_variables_initializer())

            fd = self.data_iter([input_vec], [target_vec], 1, 1)
            for i in range(100):
                _, loss, _, _ = sess.run([self.model.train_op,
                                          self.model.loss,
                                          self.model.gradient_norms,
                                          self.model.updates], fd)
                checkpoint_path = self.model_path + "nlp_chat.ckpt"
                # save the model checkpoint
                self.model.saver.save(
                    sess, checkpoint_path, global_step=self.model.global_step)

                for i, (e_in, dt_pred) in enumerate(zip(
                    fd[self.model.decoder_targets].T,
                    sess.run(self.model.decoder_prediction_train, fd).T
                )):
                    print('    sample {}:'.format(i + 1))
                    print('    dec targets > {}'.format(e_in))
                    print('    dec predict > {}'.format(dt_pred))
                    if i >= 0:
                        break
Example #7
def Participle(path):
    try:
        fp = open(path, "r")
        ad = fp.readline().strip('\n')
        na = fp.readline().strip('\n')
        ti = fp.readline().strip('\n')#time
        si = fp.readline().strip('\n')
        cont = na+fp.read()
        fp.close()
    except IOError:
        return 0

    try:
        insi = {}
        insi['time'] = ti
        print(ti)
        insi['url'] = ad
        insi['title'] = na
        insi['site'] = si#decode("gb2312").encode("utf-8")
        global fnum
        global segcont
        global doc
        seg_list = jieba.lcut(cont, cut_all=False)
        stline = ""
        for word in seg_list:
            if ((word in d) is False) and word != '\n':
                stline = stline + " " + word
        segcont.append(stline)
        print (str(fnum) + " 分词")
        doc[fnum] = insi
        fnum = fnum + 1
    except UnicodeError:
        return 0
Example #8
def segment(sentence, cut_type='word', pos=False, None_flag='O'):
    """
    切词
    :param sentence:
    :param cut_type: 'word' use jieba.lcut; 'char' use list(sentence)
    :param pos: enable POS
    :param None_flag: 'BIO' the 'O'
    :return: list
    """
    import logging
    jieba.default_logger.setLevel(logging.ERROR)
    if pos:
        if cut_type == 'word':
            word_pos_seq = posseg.lcut(sentence)
            word_seq, pos_seq = [], []
            for w, p in word_pos_seq:
                word_seq.append(w)
                pos_seq.append(p)
            return word_seq, pos_seq
        elif cut_type == 'char':
            word_seq = list(sentence)
            pos_seq = []
            for w in word_seq:
                w_p = posseg.lcut(w)
                pos_seq.append(w_p[0].flag)
            return word_seq, pos_seq
    else:
        if cut_type == 'word':
            return jieba.lcut(sentence)
        elif cut_type == 'char':
            return list(sentence)
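A quick usage sketch for the function above (hypothetical input sentence; assumes jieba and jieba's posseg are imported at module level as the snippet does):

words = segment("我爱北京天安门", cut_type='word')                        # word-level tokens
chars = segment("我爱北京天安门", cut_type='char')                        # character-level tokens
word_seq, pos_seq = segment("我爱北京天安门", cut_type='word', pos=True)  # tokens plus POS tags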
Example #9
def cut_to_lists(filein, filein_name):
    """ 分词并输出列表到文件

    分词后已去重
    """
    t = re.compile('\t')
    time_sep = re.compile('-')

    temp = []
    fileout = open(filein_name[:-4] + 'cut.txt', 'w', encoding='utf-8')

    for line in filein: # write number of users as demand in num_user   
        '''(uid, mid, time, forward_count,
        comment_count, like_count, content) = t.split(line)

        forward_count = int(forward_count)
        comment_count = int(comment_count)
        like_count = int(like_count)'''

        (uid, mid, time, content) = t.split(line) # for predict data

        cut_list = jieba.lcut(content)
        content = ' '.join(cut_list)

        ''' remove duplicates
        cut_list_no_dup = [] # remove duplicates
        for i in cut_list:
            if i not in cut_list_no_dup:
                cut_list_no_dup.append(i)
        '''
        fileout.write(json.dumps([uid, mid, time, cut_list]) + '\n')
Example #10
def findinFiles(filename, samples): # use TF-IDF
    dictionary = filetoDict(filename, dictpath)
    for file in filename:
        lines = cutlines(file)
        words = cutwords(file)
        corpus = [dictionary.doc2bow(word) for word in words]
        # convert to TF-IDF
        # class gensim.models.tfidfmodel.TfidfModel(corpus=None, id2word=None, dictionary=None, wlocal=<function identity>, wglobal=<function df2idf>, normalize=True)
        # build the model
        tfidf = models.TfidfModel(corpus)
        # apply the TfidfModel to the corpus
        corpus_tfidf = tfidf[corpus]
        # inspect the result
        # for doc in corpus_tfidf:
        #    print doc

        # similarity query
        index = similarities.MatrixSimilarity(corpus_tfidf)

        # convert the query document to TF-IDF
        # (`sample` was undefined in the original; assuming the `samples` argument holds the query string)
        sample_tfidf = tfidf[dictionary.doc2bow(jieba.lcut(samples, cut_all=False))]
        sims = index[sample_tfidf]
        # print list(enumerate(sims))

        # sort the output; in this example the query is most similar to documents 0, 7, 2 and 3
        sims = sorted(enumerate(sims), key=lambda item: item[1], reverse=True)
        # print sims
        # print sims[0]
        (x, y) = sims[0]
        # print file
        return lines[x]
Example #11
 def result_by_BM25(self, sentence):
     seg_list = jieba.lcut(sentence, cut_all=False)
     n, cleaned_dict = self.clean_list(seg_list)
     BM25_scores = {}
     for term in cleaned_dict.keys():
         r = self.fetch_from_db(term)
         if r is None:
             continue
         df = r[1]
         w = math.log2((self.N - df + 0.5) / (df + 0.5))
         docs = r[2].split('\n')
         for doc in docs:
             docid, date_time, tf, ld = doc.split('\t')
             docid = int(docid)
             tf = int(tf)
             ld = int(ld)
             s = (self.K1 * tf * w) / (tf + self.K1 * (1 - self.B + self.B * ld / self.AVG_L))
             if docid in BM25_scores:
                 BM25_scores[docid] = BM25_scores[docid] + s
             else:
                 BM25_scores[docid] = s
     BM25_scores = sorted(BM25_scores.items(), key = operator.itemgetter(1))
     BM25_scores.reverse()
     if len(BM25_scores) == 0:
         return 0, []
     else:
         return 1, BM25_scores
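For reference, the per-term contribution accumulated above is the standard BM25 weight; a minimal standalone sketch of the same computation (parameter names mirror the attributes used in result_by_BM25, the default k1/b values are only illustrative):

import math

def bm25_term_score(tf, df, ld, N, avg_l, k1=1.5, b=0.75):
    # idf-style weight of the term, then length-normalized term frequency
    w = math.log2((N - df + 0.5) / (df + 0.5))
    return (k1 * tf * w) / (tf + k1 * (1 - b + b * ld / avg_l))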
Example #12
def __clean_content(raw):
    """
    Clean the text.
    :param raw:
    :return:
    """
    # filter out whitespace characters
    # print raw
    blank_re = re.compile(r"\s+")
    temp_raw = map(lambda x: x[1:], raw)
    # print temp_raw
    texts_raw = [map(lambda x: blank_re.split(x), ele) for ele in temp_raw]
    # return texts_raw

    # tokenize
    texts_splited = [[reduce(lambda x, y: x + jieba.lcut(y), text, []) for text in temp_list] for temp_list in texts_raw]

    # filter out stop words
    texts_filtered = [[[word for word in text if word not in data.stopwords] for text in temp_filt] for temp_filt in texts_splited]

    # filter out low-frequency words
    # all_words = sum(texts_filtered, [])
    # words_once = {word for word in set(all_words) if all_words.count(word) == 1}
    # texts = [[word for word in text if word not in words_once] for text in texts_filtered]
    return texts_filtered
Example #13
def format_text(args, stop_words):

    for line in args:
        temp = map(etl, jieba.lcut(line[2].lower()))
        yield filter(lambda word: (len(word)) > 0 and
                                  (word not in stop_words), temp)
    # cursor.close()  # `cursor` is not defined in this snippet; apparently left over from the original context
Example #14
def get_data_and_write(filein, fileout):
    """用 readline 逐行读入数据并 unpack

    可用适当参数输出
    """
    t = re.compile('\t')
    time_sep = re.compile('-')

    temp = []
    for line in filein: # write number of users as demand in num_user   
        (uid, mid, time, forward_count,
        comment_count, like_count, content) = t.split(line)
        #yyyy, mm, dd = time_sep.split(time)
        #print(yyyy, mm, dd)
        forward_count = int(forward_count)
        comment_count = int(comment_count)
        like_count = int(like_count)
        cut_list = jieba.lcut(content)
        #print(cut_list)
        if forward_count == 0 and comment_count == 0 and like_count == 0:
            fileout000.write('000 '+ ' '.join(cut_list))
        else:             
            if forward_count != 0:
                fileout100.write('100 '+ ' '.join(cut_list))
            if comment_count != 0:
                fileout010.write('010 '+ ' '.join(cut_list))
            if like_count != 0:
                fileout001.write('001 '+ ' '.join(cut_list))
Example #15
def checking_sentiment(_text):
    """
    Extract sentiment words from the input string and judge its polarity.
    Returns a tuple:
    (positive words, positive word count, negative words, negative word count, polarity verdict)
    :param _text:
    """

    def good_or_bad(lg, lb, score_ok):
        gi = len(lg)
        bi = len(lb)
        if gi == 0 and bi == 0:
            return 0
        elif gi == 0 and bi != 0:
            return -1
        elif gi != 0 and bi == 0:
            return 1
        elif abs(gi - bi) <= score_ok:
            return 0
        elif gi > bi - score_ok:
            return 1
        else:
            return -1

    g = list()  # positive
    b = list()  # negative
    fenchi = jieba.lcut(_text)

    for c in fenchi:
        chi = c.encode('utf8')
        if _sentiment_dict.get(chi) == '1':
            g.append(chi)
        elif _sentiment_dict.get(chi) == '-1':
            b.append(chi)
    return ' '.join(g), len(g), ' '.join(b), len(b), good_or_bad(g, b, 1)
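checking_sentiment relies on a module-level _sentiment_dict that maps words to the strings '1' (positive) and '-1' (negative); a minimal illustrative sketch of its shape, with UTF-8 byte keys to match the .encode('utf8') lookups above (entries are made up; real lexicons are loaded from a sentiment word list):

_sentiment_dict = {
    u'好吃'.encode('utf8'): '1',    # positive word
    u'难吃'.encode('utf8'): '-1',   # negative word
}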
Example #16
 def result_by_hot(self, sentence):
     seg_list = jieba.lcut(sentence, cut_all=False)
     n, cleaned_dict = self.clean_list(seg_list)
     hot_scores = {}
     for term in cleaned_dict.keys():
         r = self.fetch_from_db(term)
         if r is None:
             continue
         df = r[1]
         w = math.log2((self.N - df + 0.5) / (df + 0.5))
         docs = r[2].split('\n')
         for doc in docs:
             docid, date_time, tf, ld = doc.split('\t')
             docid = int(docid)
             tf = int(tf)
             ld = int(ld)
             news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
             now_datetime = datetime.now()
             td = now_datetime - news_datetime
             BM25_score = (self.K1 * tf * w) / (tf + self.K1 * (1 - self.B + self.B * ld / self.AVG_L))
             td = (timedelta.total_seconds(td) / 3600) # hour
             hot_score = math.log(BM25_score) + 1 / td
             if docid in hot_scores:
                 hot_scores[docid] = hot_scores[docid] + hot_score
             else:
                 hot_scores[docid] = hot_score
     hot_scores = sorted(hot_scores.items(), key = operator.itemgetter(1))
     hot_scores.reverse()
     if len(hot_scores) == 0:
         return 0, []
     else:
         return 1, hot_scores
Example #17
def cutwords(filename):
    lines = cutlines(filename)
    print(lines)
    words = []
    for line in lines:
        words.append(jieba.lcut(line, cut_all=False))
    return words
Example #18
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"), "".decode("utf8"),
                          sentence)
        yield list(jieba.lcut(sentence))
Example #19
 def construct_postings_lists(self):
     config = configparser.ConfigParser()
     config.read(self.config_path, self.config_encoding)
     files = listdir(config['DEFAULT']['doc_dir_path'])
     AVG_L = 0
     for i in files:
         root = ET.parse(config['DEFAULT']['doc_dir_path'] + i).getroot()
         title = root.find('title').text
         body = root.find('body').text
         docid = int(root.find('id').text)
         date_time = root.find('datetime').text
         seg_list = jieba.lcut(title + '。' + body, cut_all=False)
         
         ld, cleaned_dict = self.clean_list(seg_list)
         
         AVG_L = AVG_L + ld
         
         for key, value in cleaned_dict.items():
             d = Doc(docid, date_time, value, ld)
             if key in self.postings_lists:
                 self.postings_lists[key][0] = self.postings_lists[key][0] + 1 # df++
                 self.postings_lists[key][1].append(d)
             else:
                 self.postings_lists[key] = [1, [d]] # [df, [Doc]]
     AVG_L = AVG_L / len(files)
     config.set('DEFAULT', 'N', str(len(files)))
     config.set('DEFAULT', 'avg_l', str(AVG_L))
     with open(self.config_path, 'w', encoding = self.config_encoding) as configfile:
         config.write(configfile)
     self.write_postings_to_db(config['DEFAULT']['db_path'])
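For clarity, a sketch of the in-memory structure built above (values are illustrative; Doc is the posting class constructed in the loop):

# self.postings_lists maps each term to [document frequency, [Doc(docid, date_time, tf, ld), ...]]:
# self.postings_lists = {
#     '世界杯': [2, [Doc(3, '2018-06-14 18:00:00', 5, 120),
#                    Doc(7, '2018-06-15 09:30:00', 1, 80)]],
#     ...
# }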
Example #20
	def gen_idf_file(self):
		news_data = []
		with open(self.doc_dir_path , encoding = 'utf-8') as f:
			for line in f:
				news_data.append(json.loads(line))

		n = float(len(news_data))
		idf = {}
		for i in range(0, len(news_data)):
			keyword = ' '.join(news_data[i]['keyword'])
			title = news_data[i]['title']
			body = news_data[i]['content']

			seg_list = jieba.lcut(title + '。' + keyword + body, cut_all = False)
			seg_list = set(seg_list) - self.stop_words # remove stop words
			for word in seg_list:
				word = word.strip().lower()
				if word == '' or self.is_number(word):
					continue
				if word not in idf:
					idf[word] = 1
				else:
					idf[word] = idf[word] + 1
		idf_file = open(self.idf_path, 'w', encoding = 'utf-8')
		for word, df in idf.items():
			idf_file.write('%s %.9f\n'%(word, math.log(n / df)))
		idf_file.close()
Example #21
def preprocess(in_filename, out_filename):
	in_file = open(in_filename, 'r')
	out_file = open(out_filename,'a')
	while True:
		line = in_file.readline()

		if not line:
			break
		line = line.strip()
		if not line:
			continue
		#sep by punctuation
		#sentences = re.split('《', line)
		#sentences = re.split('。|?|!|\.|-|:| |(|)', line)
		sentence = line
		#for sentence in sentences:
		#	if not sentence:
		#		continue
		cut_res = jieba.lcut(sentence, cut_all=True)
		remove_stopwords(cut_res)
		if not cut_res:
			continue
		new_line = ' '.join(cut_res)+'\n'
		out_file.write(new_line)

	in_file.close()
	out_file.close()	
Example #22
def spritz(language="chinese"):
    if language == 'english':
        f = open('./app/static/file/article_eg.txt','r')
        article = f.read()
        f.close()
        paragraghs = article.split("\n")
        words = []
        for para in paragraghs:
            words.extend(para.split(' '))
        return render_template('spritz.html', paragraghs=paragraghs, words=words, length=len(words), version='en' )
    if language == 'chinese':
        f = open('./app/static/file/article_ch.txt','r')
        article = f.read()
        f.close()
        article = article.decode('utf-8')
        paragraghs = article.split("\n")
        re_attach_article = ""
        for p in paragraghs:
            re_attach_article += p
        words = jieba.lcut(re_attach_article)
        return render_template('spritz.html', paragraghs=paragraghs, words=words, length=len(words), version='ch' )
    if language == 'chinese_led':
        f = open('./app/static/file/article_ch.txt','r')
        article = f.read()
        f.close()
        clean_article = article.replace(' ','')
        clean_article = clean_article.replace('\n', '')
        clean_article = clean_article.decode('utf-8')
        article = article.decode('utf-8')
        paragraghs = article.split("\n")
        words = list(clean_article)
        return render_template('spritz.html', paragraghs=paragraghs, words=words, length=len(words), version='led')
Example #23
def filter_word_by_jieba(text):
    """利用分词过滤常用词,例如语气词等"""
    tmp_segs = jieba.lcut(text, cut_all=False, HMM=False)
    
    ssegs,tmp = [],[]
    for seg in tmp_segs:
        if seg == ".":
            ssegs.append(tmp)
            tmp = []
        else:
            tmp.append(seg)
    
    data = []
    
    def part_by_part(segs):
        tmp = ""
        for idx, seg in enumerate(segs):
            if len(seg)==1: tmp += seg
            if (idx==len(segs)-1 or len(segs[idx+1])>1) and len(tmp)>1:
                data.append(tmp)
                tmp = ""
    for ss in ssegs:
        part_by_part(ss)
    
    return data
Example #24
def rewrite2wordlist(inputFileName,outputFileName,haslabel=True,maxLine=-1):
    fin=codecs.open(inputFileName,'r','utf-8')
    fout=codecs.open(outputFileName,'w','utf-8')
    if haslabel:
        foutlb=codecs.open(outputFileName+u'.lb','w','utf-8')
    import jieba
    lcount=0
    while True:
        line=fin.readline()
        if not line:break
        line=line.strip()
        sentance=line
        if haslabel:
            lb=line.split(u' ')[0]
            sentance= u' '.join(line.split(u' ')[1:])

        while u'  ' in sentance:
            sentance=sentance.replace(u'  ', u' ')
        if sentance.endswith(u'None'): continue
        if len(sentance)<4:continue
        fout.write(lb+u' '+u' '.join(jieba.lcut(sentance)) + u'\n')
        if haslabel:
            foutlb.write(lb+u'\n')
        lcount+=1
        if lcount%5000==0:
            print(lcount)
        if maxLine!=-1:
            if lcount > maxLine:
                break
    fin.close()
    fout.close()
    if haslabel:
        foutlb.close()
Example #25
def load_data_and_multilabels():
    """
    5 kinds of company name
    """
    # Load data from files
    positive_examples = list(open("normal.csv").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("rubbish.csv").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    invest_examples = list(open("invest.csv").readlines())
    invest_examples = [s.strip() for s in invest_examples]
    sci_examples = list(open("science.csv").readlines())
    sci_examples = [s.strip() for s in sci_examples]
    serv_examples = list(open("service.csv").readlines())
    serv_examples = [s.strip() for s in serv_examples]
    # Split by words
    x_text = positive_examples + negative_examples+invest_examples+sci_examples+serv_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [jieba.lcut(s) for s in x_text]
    # Generate labels
    positive_labels = [1 for _ in positive_examples]
    negative_labels = [0 for _ in negative_examples]
    invest_labels = [2 for _ in invest_examples]
    sci_labels = [3 for _ in sci_examples]
    serv_labels = [4 for _ in serv_examples]
    y = np.concatenate([positive_labels, negative_labels, invest_labels, sci_labels, serv_labels], 0)
    return [x_text, y]
Example #26
def tokenizer(text):
    ''' Simple parser: remove the line breaks from each document and
        segment it into a word list with jieba.lcut
    '''
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text
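A usage sketch with made-up input (the function expects an iterable of document strings and returns a list of token lists):

docs = ["今天天气不错\n适合出门", "我爱自然语言处理"]
print(tokenizer(docs))  # e.g. [['今天', '天气', '不错', '适合', '出门'], ['我', '爱', '自然语言', '处理']]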
Example #27
def jieba_example():
    raw = "我爱北京S5天安门(,123,三,四"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print(word, flag)
Example #28
	def get_phrase_answers(self, answers):
		"""Get phrase answers and append them to the overall list of answers"""
		if self.query["phrase"]:
			for phrase in self.query["phrase"]:
				phlist = jieba.lcut(phrase, cut_all = False)
				for term in phlist:
					answers.append(term)
		return answers
Example #29
def String_make_corpus(text):
    corpus = ""
    if isinstance(text, basestring):
        words_seg = jieba.lcut(text)
        for i in range(len(words_seg)):
            words_seg[i] = words_seg[i].encode('utf-8')
        corpus = ' '.join(words_seg)
    return corpus
Example #30
	def get_boolean_answers(self, answers):
		"""Get boolean answers and append them to the overall list of answers"""
		if self.query["bool"]:
			for sentence in self.query["bool"]:
				boolist = jieba.lcut(sentence, cut_all = False)
				for term in boolist:
					answers.append(term)
		return answers
Example #31
def vector_similarity(s1, s2):
    # reconstructed header for this truncated snippet; `model` is assumed to be a pre-loaded
    # 200-dimensional word-vector model (e.g. gensim KeyedVectors) and `norm` to come from numpy.linalg
    def sentence_vector(s):
        words = jieba.lcut(s)
        v = np.zeros(200)
        for word in words:
            v += model[word]
        v /= len(words)
        return v

    v1, v2 = sentence_vector(s1), sentence_vector(s2)
    return np.dot(v1, v2) / (norm(v1) * norm(v2))


#sentence1="抢劫"
sentence1 = "渔船用5厘米尺寸的渔网捕鱼"
sentence2 = "使用炸鱼、毒鱼、电鱼等破坏渔业资源方法进行捕捞"
##sentence2=sentence2.replace(',', '').replace('。', '').replace('?', '').replace('!', '') \
##        .replace('“', '').replace('”', '').replace(':', '').replace('…', '').replace('(', '').replace(')', '') \
##        .replace('—', '').replace('《', '').replace('》', '').replace('、', '').replace('‘', '') \
##        .replace('’', '').replace(' ','')    # strip punctuation
sentence3 = "抢夺公私财物"
sentence4 = "使用小于最小网目尺寸的网具进行捕捞"

words = jieba.lcut(sentence3, cut_all=False)

#print(words)
##
##words2=[word  for word in words if word not in "、|,"]
##print(words2)

print(vector_similarity(sentence1, sentence2))
print(vector_similarity(sentence1, sentence3))
print(vector_similarity(sentence1, sentence4))
Example #32
# Example 6-1: segment the sentence "2018年世界杯小组赛抽签在莫斯科克里姆林宫举行".

import jieba
print(jieba.lcut("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行"))
print(jieba.lcut("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行", cut_all=True))
print(jieba.lcut_for_search("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行"))

# Example 6-2: show the part-of-speech tag of every word from Example 6-1.
import jieba.posseg as pseg
words = pseg.cut("2018年世界杯小组赛抽签在莫斯科克里姆林宫举行")
for word, tag in words:
    print('word:{}, tag:{}'.format(word, tag))
Example #33
#GovRptWordCloudv2T1.py
import matplotlib.pyplot as plt
from PIL import Image
import jieba
import wordcloud
#from scipy.misc import imread
import imageio
mask = imageio.imread("fivestart.png")
excludes = {}
f = open("新时代中国特色社会主义.txt", "r", encoding="UTF-8")
t = f.read()
f.close()
ls = jieba.lcut(t)
txt = " ".join(ls)
w = wordcloud.WordCloud(\
                        width = 1000, height = 700,\
                        background_color = "white",
                        font_path = "Hiragino Sans GB.ttc", mask = mask
                        )
w.generate(txt)
w.to_file("grwordcloud.png")

im = Image.open("grwordcloud.png")
im.show()
Example #34
def predict_one(s):
    s = clean_text(s)
    s = np.array(doc2num(jieba.lcut(s), MAX_SEQUENCE_LENGTH))
    s = s.reshape((1, s.shape[0]))
    return model.predict_classes(s, verbose=0)[0][0]
Example #35
def fenci(x):
    words = jieba.lcut(x)
    return " ".join(words)
Example #36
import jieba
s = "世界冠军运动员的乒乓球拍卖完了"
ls = jieba.lcut(s, True)
print(ls)
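The positional True above is the cut_all flag; an equivalent, more explicit call and the default precise mode for comparison:

ls_full = jieba.lcut(s, cut_all=True)   # full mode, same as jieba.lcut(s, True)
ls_precise = jieba.lcut(s)              # default precise mode
print(ls_full)
print(ls_precise)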
Example #37
def clean_cut(s):
    # clean the text, segment it, and join with spaces so it can be turned into a TF-IDF vector representation
    # some texts also contain literal \\n sequences, which are removed as well
    return ' '.join(jieba.lcut(re.sub('[\r\n\u3000]', '', s).replace('\\n','')))
Example #38
    def test_process_hits(self, sess, data, args):

        with open(os.path.join(args.datapath, 'test_distractors.json'),
                  'r',
                  encoding='utf8') as f:
            test_distractors = json.load(f)

        data.restart("test", batch_size=1, shuffle=False)
        batched_data = data.get_next_batch("test")

        loss_record = []
        cnt = 0
        while batched_data != None:

            for key in batched_data:
                if isinstance(batched_data[key], np.ndarray):
                    batched_data[key] = batched_data[key].tolist()

            batched_data['resp_length'] = [len(batched_data['resp'][0])]
            for each_resp in test_distractors[cnt]:
                batched_data['resp'].append(
                    [data.go_id] +
                    data.convert_tokens_to_ids(jieba.lcut(each_resp)) +
                    [data.eos_id])
                batched_data['resp_length'].append(
                    len(batched_data['resp'][-1]))
            max_length = max(batched_data['resp_length'])
            resp = np.zeros((len(batched_data['resp']), max_length), dtype=int)
            for i, each_resp in enumerate(batched_data['resp']):
                resp[i, :len(each_resp)] = each_resp
            batched_data['resp'] = resp

            post = []
            post_length = []
            prev_length = []

            kg = []
            kg_h_length = []
            kg_hr_length = []
            kg_hrt_length = []
            kg_index = []

            for _ in range(len(resp)):
                post += batched_data['post']
                post_length += batched_data['post_length']
                prev_length += batched_data['prev_length']

                kg += batched_data['kg']
                kg_h_length += batched_data['kg_h_length']
                kg_hr_length += batched_data['kg_hr_length']
                kg_hrt_length += batched_data['kg_hrt_length']
                kg_index += batched_data['kg_index']

            batched_data['post'] = post
            batched_data['post_length'] = post_length
            batched_data['prev_length'] = prev_length

            batched_data['kg'] = kg
            batched_data['kg_h_length'] = kg_h_length
            batched_data['kg_hr_length'] = kg_hr_length
            batched_data['kg_hrt_length'] = kg_hrt_length
            batched_data['kg_index'] = kg_index

            _, _, loss, _, _ = self.inference(sess,
                                              batched_data,
                                              lamb=args.lamb)
            loss_record.append(loss)
            cnt += 1
            batched_data = data.get_next_batch("test")

        assert cnt == len(test_distractors)

        loss = np.array(loss_record)
        loss_rank = np.argsort(loss, axis=1)
        hits1 = float(np.mean(loss_rank[:, 0] == 0))
        hits3 = float(np.mean(np.min(loss_rank[:, :3], axis=1) == 0))
        hits5 = float(np.mean(np.min(loss_rank[:, :5], axis=1) == 0))
        return {'hits@1': hits1, 'hits@3': hits3, 'hits@5': hits5}
Example #39
 def cut_sentences(self, text):
     """文本分句,然后分词"""
     sentences = re.findall(".*?[。?!]", text)
     cut_sentences = [jieba.lcut(sent) for sent in sentences]
     return cut_sentences
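A usage sketch, assuming an instance named splitter of the class this method belongs to (hypothetical name); note the regex only keeps sentences ending in 。, ? or !, so a trailing fragment without such punctuation is dropped:

sentences = splitter.cut_sentences("今天天气不错。你吃饭了吗?我们走吧!后面这段没有句末标点")
# -> three token lists; the trailing fragment is not matched by ".*?[。?!]"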
Example #40
import jieba

from pyecharts import options as opts
from pyecharts.charts import WordCloud

story = '''
实打实打算阿斯顿阿三爱上倒是都i阿松嗲送i的厚爱收到安抚和速度放缓阿斯顿法海速度回复哎u收到饭后阿萨的哈佛i啊u是和覅u送达和覅u阿斯顿哈佛i撒旦和覅说的话弗兰克世界大会风口浪尖撒旦和李开复盛大开放哈桑了艰苦奋斗
'''

words = jieba.lcut(story)
counts = {}
for w in words:
    if len(w) == 1:
        continue
    else:
        counts[w] = counts.get(w, 0) + 1
items = list(counts.items())

words = [(k, v) for (k, v) in items]

(WordCloud().add(
    series_name="MyLove", data_pair=words,
    word_size_range=[6, 66]).set_global_opts(
        title_opts=opts.TitleOpts(
            title="MyLove",
            title_textstyle_opts=opts.TextStyleOpts(font_size=23)),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    ).render("MyLove.html"))
Example #41
def tokenize(sequence):
    word_list = jieba.lcut(sequence)
    return word_list
Example #42
'''
wordcloud
pip3 install wordcloud
w=wordcloud.WordCloud()
w=wordcloud.WordCloud(width=400)
w=wordcloud.WordCloud(height=200)
w=wordcloud.WordCloud(min_font_size=4)
w=wordcloud.WordCloud(max_font_size)
w=wordcloud.WordCloud(font_step=1)
w=wordcloud.WordCloud(font_path=None)
w=wordcloud.WordCloud(max_words=200)
w=wordcloud.WordCloud(stopwords={""})
mask
	from scipy.misc import imread
	mk=imread("pic.png")
	w=wordcloud.WordCloud(mask=mk)
w=wordcloud.WordCloud(background_color="black")
w.generate(txt)
w.to_file(filename) .png .jpg
'''
import jieba
import wordcloud

txt = "程序设计语言是计算机能够理解和识别用户操作意图的一种交互体系,它按照特定规则组织计算机指令,使计算机能够自动进行各种运算处理"
w = wordcloud.WordCloud(width=1000, height=700, font_path="msyh.ttf")
w.generate(' '.join(jieba.lcut(txt)))
w.to_file("pywordcloud.png")
Example #43
#    with open("6第六周\沉默的羔羊.txt","r",encoding="utf-8") as f:
#        txt=f.read()
#        return txt
#words=jieba.lcut(function())
#counts={}
#for x in words:
#    if len(x)>2:
#        counts[x]=counts.get(x,0)+1
#items=list(counts.items())
#items.sort(key=lambda x:x[1],reverse=True)
#for i in range(1):
#    word, count = items[i]
#    print ("{}".format(word))

import jieba
f = open("6第六周\沉默的羔羊.txt", "r", encoding="utf-8")
ls = jieba.lcut(f.read())
#ls = f.read().split()
d = {}
for w in ls:
    d[w] = d.get(w, 0) + 1
maxc = 0
maxw = ""
for k in d:
    if d[k] > maxc and len(k) > 2:
        maxc = d[k]
        maxw = k
    if d[k] == maxc and len(k) > 2 and k > maxw:
        maxw = k
print(maxw)
f.close()
Example #44
# -*- coding: utf-8 -*- 
"""
Project: PyLib
Author: Jarod
Create time: 2020-04-15 12:02
IDE: PyCharm
Introduction:
"""

import jieba
import wordcloud

'''
txt = "life is short, you need python"
w = wordcloud.WordCloud(background_color="white")
w.generate(txt)
w.to_file("pywordcloud.png")
'''
txt = "程序设计语言是计算机能顾理解和识别用户操作意图的一种交互体系,它按照特定规则组织计算机指令,是计算机能够自动运行各种运算处理。"
w = wordcloud.WordCloud(width=1000, height=700, font_path="msyh.ttc")

w.generate(" ".join(jieba.lcut(txt)))
w.to_file("pywordcloud.png")
Example #45
    _context, _trigger, _object, _subject, _time, _location = \
        item[1].replace("－", "-").replace("～", "~"), item[2], item[3].replace("－", "-"), item[4].replace("－", "-"), \
        item[5].replace("－", "-"), item[6].replace("－", "-")
    # special handling (only seen in the training set): normalize full-width variants of 0-9
    for i in range(10):
        r_c = chr(65296 + i)
        _context = _context.replace(r_c, "%d" % i)
        _trigger = _trigger.replace(r_c, "%d" % i)
        _object = _object.replace(r_c, "%d" % i)
        _subject = _subject.replace(r_c, "%d" % i)
        _time = _time.replace(r_c, "%d" % i)
        _location = _location.replace(r_c, "%d" % i)
    trigger_index = len(obj)
    # handle the triggers first
    x = list(jieba.tokenize(_context))  # tokens with character offsets
    y = jieba.lcut(_context)  # plain token sequence
    assert len(x) == len(y)

    overlap_flag = False
    __context = ""
    overlap_index = -1

    for i in range(trigger_index):
        if obj[trigger_index - 1 -
               i]["trigger"] == _trigger or _trigger in obj[trigger_index - 1 -
                                                            i]["trigger"]:
            overlap_flag = True
            x = list(
                jieba.tokenize(_context[obj[trigger_index - 1 - i]["end"] +
                                        1:]))
            y = jieba.lcut(_context[obj[trigger_index - 1 - i]["end"] + 1:])
Example #46
def tokenizer(text):
    text = [jieba.lcut(document.replace('\n', '')) for document in text]
    return text
Example #47
            sleep_num1 = random.randint(90, 120)
            time.sleep(sleep_num1)
            try:
                print("正在获取请求......")
                response = requests.get(url=url,
                                        headers=headers,
                                        params=params)
                print("获取请求成功!")
                response.encoding = 'utf-8'
                page_text = response.text
                r_ex = '<div class="c-abstract">(.*?)</div>'
                content_list = re.findall(r_ex, page_text, re.S)
                print('content_list:', end=' ')
                print(content_list)
                for content in content_list:
                    content_fenci = jieba.lcut(content, cut_all=True)
                    print('content_fenci:', end=' ')
                    print(content_fenci)
                    for fenci in content_fenci:
                        if fenci == kw:
                            kw_cnt = kw_cnt + 1
                    ent_kw_cnt[loop_cnt] = kw_cnt
                    print(kw + "出现频次:", end=' ')
                    print(kw_cnt)
            except requests.exceptions.ConnectionError:
                print("------这一条爬取失败,休息3分钟-----")
                time.sleep(180)
                continue
        loop_cnt = loop_cnt + 1

    if ent_kw_cnt[0] != "卫星石化":
Example #48
# wordCloudthreeKingdoms.py

import wordcloud
import jieba

# read the txt file
threeKingdomsTxt = open("E:\\21_git\\pythonMOOC\\week6\\threekingdoms.txt", \
                            "rt", encoding = 'utf-8').read()
# segment with the jieba library
words = jieba.lcut(threeKingdomsTxt)
# drop single-character tokens (the original `del word` inside the loop had no effect on the list)
words = [word for word in words if len(word) > 1]
# join into a single space-separated string
txt = " ".join(words)

# generate the word cloud
wc = wordcloud.WordCloud(width = 1920, height = 1080, font_path="msyh.ttc", \
                            background_color = 'white') # note: a Chinese font must be set
wc.generate(txt)
wc.to_file('wordCloudthreeKingdoms.png')
Example #49
# load the custom user dictionary for jieba
root = os.path.dirname(os.path.realpath(__file__))
my_dict = root+'/dict/mydict.txt'
template = root+'/dict/template.txt'

# list of template dicts
ltemplate = []
fdatas = open(template , 'r', encoding='utf_8').readlines()
for line in fdatas:
    sdict = {}
    sline = line.split("|")
    sdict["sname"] = sline[0]
    sdict["stag"] = sline[1]
    sdict["sentence"] = sline[2]
    # also store the segmented words in the dict
    sn = jieba.lcut(sline[2],cut_all=False)
    sdict["sn"] = sn
    ltemplate.append(sdict)

def get_catalog(sentence):

    # first split the sentence content on punctuation
    asentence = re.split("。|,|?|\.|\?|,", sentence)
    max = 0
    catalog = []
    for s in asentence:
        for t1 in ltemplate:
            sn = t1["sn"]
            diff = count_distance(s, sn)
            if diff > max:
                max = diff
Example #50
def tokenize(line, is_zh):
    tokens = jieba.lcut(line) if is_zh else nltk.word_tokenize(line)
    return ' '.join(tokens), len(tokens)
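A usage sketch; the English branch assumes nltk is imported and its 'punkt' tokenizer data is available (not shown in the snippet):

zh_text, zh_len = tokenize("今天天气不错", is_zh=True)                   # jieba segmentation
en_text, en_len = tokenize("The weather is nice today.", is_zh=False)   # nltk.word_tokenize
print(zh_text, zh_len)
print(en_text, en_len)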
Example #51
def tokenize(sent):
    # with open(stop_words_path, "r", encoding="utf-8") as sw:
    #     stop_words_ = sw.readlines()
    # stop_words = [w.replace("\n", "") for w in stop_words_]
    return ' '.join([w for w in jieba.lcut(clean(sent.strip()))])
Example #52
def cut(string):
    return jieba.lcut(string)
Example #53
def sentiment_score_list(seg_sentence):
    # seg_sentence = dataset.split('。')
    words = []
    count1 = []
    count2 = []
    senti_score_words_result = {"count2": count2, "words": words}
    for index, sen in enumerate(seg_sentence):  # iterate over every comment
        # print index
        if not sen:
            continue
        segtmp = jieba.lcut(sen, cut_all=False)  # segment the sentence; returns a list
        # add all tokens to the overall word list
        words.extend(segtmp)
        i = 0  # position of the word currently being scanned
        a = 0  # position just after the last sentiment word
        poscount = 0  # initial score of a positive word
        poscount2 = 0  # positive score after negation flips
        poscount3 = 0  # final positive score (including exclamation marks)
        negcount = 0
        negcount2 = 0
        negcount3 = 0
        for word in segtmp:
            word = word.encode("utf-8")
            if word in posdict:  # is the word a (positive) sentiment word?
                poscount += 1
                c = 0
                for w in segtmp[a:i]:  # scan degree adverbs before the sentiment word
                    w = w.encode("utf-8")
                    if w in mostdict:
                        poscount *= 4.0
                    elif w in verydict:
                        poscount *= 3.0
                    elif w in moredict:
                        poscount *= 2.0
                    elif w in ishdict:
                        poscount *= 0.5
                    elif w in deny_word:
                        c += 1
                if judgeodd(c) == 'odd':  # an odd number of negations before the sentiment word flips its polarity
                    poscount *= -1.0
                    poscount2 += poscount
                    poscount = 0
                    poscount3 = poscount + poscount2 + poscount3
                    poscount2 = 0
                else:
                    poscount3 = poscount + poscount2 + poscount3
                    poscount = 0
                a = i + 1  # move past the sentiment word

            elif word in negdict:  # negative sentiment, analogous to the positive branch above
                negcount += 1
                d = 0
                for w in segtmp[a:i]:
                    w = w.encode("utf-8")
                    if w in mostdict:
                        negcount *= 4.0
                    elif w in verydict:
                        negcount *= 3.0
                    elif w in moredict:
                        negcount *= 2.0
                    elif w in ishdict:
                        negcount *= 0.5
                    elif w in degree_word:
                        d += 1
                if judgeodd(d) == 'odd':
                    negcount *= -1.0
                    negcount2 += negcount
                    negcount = 0
                    negcount3 = negcount + negcount2 + negcount3
                    negcount2 = 0
                else:
                    negcount3 = negcount + negcount2 + negcount3
                    negcount = 0
                a = i + 1
            elif word == '!' or word == '!':  # does the sentence contain an exclamation mark?
                for w2 in segtmp[::-1]:  # scan backwards for a sentiment word; add 2 to its weight and stop
                    if w2 in posdict or w2 in negdict:  # the original `w2 in posdict or negdict` was always truthy
                        poscount3 += 2
                        negcount3 += 2
                        break
            i += 1  # advance the scan position

            # the following prevents negative scores
            pos_count = 0
            neg_count = 0
            if poscount3 < 0 and negcount3 > 0:
                neg_count += negcount3 - poscount3
                pos_count = 0
            elif negcount3 < 0 and poscount3 > 0:
                pos_count = poscount3 - negcount3
                neg_count = 0
            elif poscount3 < 0 and negcount3 < 0:
                neg_count = -poscount3
                pos_count = -negcount3
            else:
                pos_count = poscount3
                neg_count = negcount3

            count1.append([pos_count, neg_count])
        count2.append(count1)
        count1 = []

    return senti_score_words_result
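The function above depends on several module-level lexicons (posdict, negdict, mostdict, verydict, moredict, ishdict, deny_word, degree_word) that are not shown; a minimal illustrative sketch of their expected shape, with UTF-8 byte entries to match the .encode("utf-8") lookups (entries are made up; real lexicons come from sentiment and degree word lists):

posdict = {u'喜欢'.encode('utf-8'), u'满意'.encode('utf-8')}   # positive sentiment words
negdict = {u'讨厌'.encode('utf-8'), u'失望'.encode('utf-8')}   # negative sentiment words
mostdict = {u'最'.encode('utf-8')}     # degree words: score x4.0
verydict = {u'非常'.encode('utf-8')}   # degree words: score x3.0
moredict = {u'更'.encode('utf-8')}     # degree words: score x2.0
ishdict = {u'稍微'.encode('utf-8')}    # degree words: score x0.5
deny_word = {u'不'.encode('utf-8')}    # negation words counted before a sentiment word
degree_word = deny_word                # the negative branch above checks this name; presumably the same negation list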
Example #54
    def train(self, datas, labels, **kwargs):

        if 'retrain' in kwargs and kwargs['retrain'] is True:
            Utils.remove_models(self.model_dir)

        self.datas = datas
        self.labels = labels

        # load the previously saved model and epoch counter
        self.model, self.start_epoch = Utils.load_previous_model(
            self.model_dir)
        lang = Lang()
        all_labels = []
        for idx in range(len(datas)):
            data = datas[idx]
            if type(data) == str:
                data_list = jieba.lcut(data)
            lang.index_words(data)
            if labels[idx] not in all_labels:
                all_labels.append(labels[idx])

        if self.kwargs['word2vec']:
            if self.kwargs['word2vec_model'] == None:
                raise ValueError('请填写word2vec模型存储路径')
            self.dataset = Dataset(
                datas,
                labels,
                lang,
                all_labels,
                batch_size=self.kwargs['batch_size'],
                word2vec=True,
                word2vec_model=self.kwargs['word2vec_model'])
            if self.model == None:
                self.kwargs['word_embedding'] = self.dataset.word_embedding
                self.kwargs['embed_size'] = self.dataset.embed_size
        else:
            self.dataset = Dataset(datas,
                                   labels,
                                   lang,
                                   all_labels,
                                   batch_size=self.kwargs['batch_size'],
                                   word2vec=False)

        param = self.kwargs
        param['dataset'] = self.dataset
        Utils.save_param(param, self.model_dir)

        if self.model == None:
            self.kwargs['input_size'] = lang.n_words
            self.kwargs['output_size'] = len(all_labels)
            self.model = BiGRUNet(self.kwargs)

        self.criterion = torch.nn.NLLLoss()
        self.optimizer = torch.optim.Adam(self.model.parameters(),
                                          lr=self.kwargs['lr_rate'])

        self.n_epochs = len(labels) * self.kwargs['epoch']
        self.progress = ProgressBar(count=self.start_epoch,
                                    total=self.n_epochs + 1)

        self.train_iter()
Example #55
import jieba
from wordcloud import WordCloud

# exclusion list used to drop meaningless tokens
excludes = {
    "div", "h5", "class", "h4", "h3", "h2", "h1", "conlun2_box_text", "p", "id"
}
# open the freshly crawled txt document in read-only mode
f = open(
    '/home/mark/Documents/Code-insider/Unnamed/Udemy/Python/Grawl_analyse_report/reports/2014_gov_report.txt',
    'r',
    encoding='utf-8')
txt = f.read()
f.close()

words = jieba.lcut(txt)  # segment using precise mode
newtxt = ' '.join(words)  # join the segmented words with spaces (''.join would glue them back together)
# configure the word cloud and generate it from newtxt
wordcloud = WordCloud(
    background_color="white",  # background colour of the image
    width=800,  # image width
    height=600,  # image height
    # full path to a font file; Chinese may not render without it
    font_path="/usr/share/fonts/opentype/noto/NotoSansCJK-Regular.ttc",
    max_words=100,  # maximum number of words in the cloud
    max_font_size=80,  # maximum font size in the cloud
    stopwords=excludes,  # words to exclude
).generate(newtxt)
wordcloud.to_file('Wordcloud_Analysis.png')
Example #56
def cut_word():
    import jieba
    jieba.load_userdict(os.path.join(Data_PATH, "word_dict.txt"))
    jieba.add_word('_e11_')
    jieba.add_word('_e12_')
    jieba.add_word('_e21_')
    jieba.add_word('_e22_')
    for en_word in ['disease','reason','symptom','test','test_value','drug','frequency','amount','method','treatment',
                    'operation','sideeff','anatomy','level','duration']:
        jieba.add_word(en_word)

    for file_name in ['sample_negative.txt', 'sample_positive.txt']:
        with open(os.path.join(Data_PATH, file_name.replace('.txt', '_cut.txt')), 'w') as fout:
            with open(os.path.join(Data_PATH, file_name)) as f:
                for line in f:
                    r_name, sent, fn, rid = line.rstrip().split('\t')
                    if any(a in sent for a in ['_e11_', '_e12_', '_e21_', '_e22_']):
                        print('contains new annotation !!!')

                    # if not (fn == '131_5' and rid == 'R21'):
                    #     continue
                    # e1_s = sent.index('<e1>')
                    # e1_e = sent.index('</e1>')
                    # e2_s = sent.index('<e2>')
                    # e2_e = sent.index('</e2>')

                    sent_simp = sent.replace('<e1>', '_e11_').replace('</e1>', '_e12_').replace('<e2>', '_e21_').replace('</e2>', '_e22_')

                    sent_cut = jieba.lcut(sent_simp, HMM=False)
                    # i = 0
                    # offset = []
                    # for w in sent_cut:
                    #     offset.append([i, i+len(w), w])
                    #     i += len(w)
                    #
                    # if e1_s < e2_s:
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e1_s == 0:
                    #             offset.insert(0, [e1_s, e1_s + 4, '<e1>'])
                    #             offset = offset[:1] + [[si+4, ei+4, w] for si, ei, w in offset[1:]]
                    #             break
                    #
                    #         if e_idx == e1_s:
                    #             offset.insert(idx+1, [e1_s, e1_s + 4, '<e1>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e1_e:
                    #             offset.insert(idx+1, [e1_e, e1_e + 5, '</e1>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e2_s:
                    #             offset.insert(idx+1, [e2_s, e2_s + 4, '<e2>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e2_e:
                    #             offset.insert(idx+1, [e2_e, e2_e + 5, '</e2>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    # else:
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e2_s == 0:
                    #             offset.insert(0, [e2_s, e2_s + 4, '<e2>'])
                    #             offset = offset[:1] + [[si+4, ei+4, w] for si, ei, w in offset[1:]]
                    #             break
                    #
                    #         if e_idx == e2_s:
                    #             offset.insert(idx+1, [e2_s, e2_s + 4, '<e2>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e2_e:
                    #             offset.insert(idx+1, [e2_e, e2_e + 5, '</e2>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e1_s:
                    #             offset.insert(idx+1, [e1_s, e1_s + 4, '<e1>'])
                    #             offset = offset[:idx+2] + [[si+4, ei+4, w] for si, ei, w in offset[idx+2:]]
                    #             break
                    #
                    #     for idx, (s_idx, e_idx, w) in enumerate(offset):
                    #         if e_idx == e1_e:
                    #             offset.insert(idx+1, [e1_e, e1_e + 5, '</e1>'])
                    #             offset = offset[:idx+2] + [[si+5, ei+5, w] for si, ei, w in offset[idx+2:]]
                    #             break

                    # sent_cut_upt = [w for _, _, w in offset]

                    fout.write('{}\t{}\t{}\t{}\n'.format(r_name, ' '.join(sent_cut), fn, rid))
Example #57
    # print(df)
    # data cleaning
    # data processing
    # 1. gender of viewers
    gender = df['gender'].value_counts().sort_index(ascending=1)
    print(gender)
    gender_title = '观看人群性别比'
    pie(gender, gender_title)
    # 2. rating distribution
    score = df['score'].value_counts().sort_index(ascending=1)
    print(score)
    score_title = '评分占比情况'
    pie(score, score_title)

    # 3. review analysis
    df['comment'].to_csv('zhuxian.txt', index=False)
    f = open('zhuxian.txt', 'r', encoding='utf-8')
    txt = f.read()
    f.close()
    words = jieba.lcut(txt)
    # print(words)
    nettxt = ' '.join(words)
    # print(nettxt)
    wordcloud = WordCloud(background_color='white',
                          width=800,
                          height=600,
                          font_path='msyh.ttc',
                          max_words=200,
                          max_font_size=80).generate(nettxt)
    wordcloud.to_file('诛仙评论词云.png')
Example #58
def tokenize(sents):
    res = []
    for sent in sents:
        res.append(jieba.lcut(sent[0]))
    return res
Example #59
import jieba
s = "中国特色社会主义进入新时代,我国社会主要矛盾已经转化为人民日益增长的美好生活需要和不平衡不充分的发展之间的矛盾。"
n = len(s)
m = len(jieba.lcut(s))
print("中文字符数为{},中文词语数为{}。".format(n, m))
Example #60
    def predictByBeamSearch(self,
                            inputSeq,
                            beamWidth=10,
                            maxAnswerLength=32,
                            alpha=0.7,
                            isRandomChoose=False,
                            allRandomChoose=False,
                            improve=True,
                            showInfo=False):
        outputSize = len(self.id2word)
        inputSeq = filter_sent(inputSeq)
        inputSeq = [
            w for w in jieba.lcut(inputSeq) if w in self.word2id.keys()
        ]

        X = seq2id(self.word2id, inputSeq)
        XLens = torch.tensor([len(X) + 1], dtype=torch.int, device=self.device)
        X = X + [eosToken]
        X = torch.tensor([X], dtype=torch.long, device=self.device)

        d = int(self.encoderRNN.bidirectional) + 1
        hidden = torch.zeros(
            (d * self.encoderRNN.numLayers, 1, self.hiddenSize),
            dtype=torch.float32,
            device=self.device)
        encoderOutput, hidden = self.encoderRNN(X, XLens, hidden)
        hidden = hidden[-d * self.decoderRNN.numLayers::2].contiguous()

        Y = np.ones([beamWidth, maxAnswerLength], dtype='int32') * eosToken
        # prob: beamWidth × 1
        prob = np.zeros([beamWidth, 1], dtype='float32')
        decoderInput = torch.tensor([[sosToken]],
                                    dtype=torch.long,
                                    device=self.device)
        # decoderOutput: 1 × 1 × outputSize; hidden: numLayers × 1 × hiddenSize
        decoderOutput, hidden, decoderAttentionWeight = self.decoderRNN(
            decoderInput, hidden, encoderOutput)
        # topv: 1 × 1 × beamWidth; topi: 1 × 1 × beamWidth
        topv, topi = decoderOutput.topk(beamWidth)
        # decoderInput: beamWidth × 1
        decoderInput = topi.view(beamWidth, 1)
        for i in range(beamWidth):
            Y[i, 0] = decoderInput[i].item()
        Y_ = Y.copy()
        prob += topv.view(beamWidth, 1).data.cpu().numpy()
        prob_ = prob.copy()
        # hidden: numLayers × beamWidth × hiddenSize
        hidden = hidden.expand(-1, beamWidth, -1).contiguous()
        localRestId = np.array([i for i in range(beamWidth)], dtype='int32')
        encoderOutput = encoderOutput.expand(
            beamWidth, -1, -1)  # => beamWidth × 1 × hiddenSize
        for i in range(1, maxAnswerLength):
            # decoderOutput: beamWidth × 1 × outputSize; hidden: numLayers × beamWidth × hiddenSize; decoderAttentionWeight: beamWidth × 1 × XSeqLen
            decoderOutput, hidden, decoderAttentionWeight = self.decoderRNN(
                decoderInput, hidden, encoderOutput)
            # topv: beamWidth × 1; topi: beamWidth × 1
            if improve:
                decoderOutput = decoderOutput.view(-1, 1)
                if allRandomChoose:
                    topv, topi = self._random_pick_k_by_prob(decoderOutput,
                                                             k=beamWidth)
                else:
                    topv, topi = decoderOutput.topk(beamWidth, dim=0)
            else:
                topv, topi = (torch.tensor(prob[localRestId],
                                           dtype=torch.float32,
                                           device=self.device).unsqueeze(2) +
                              decoderOutput).view(-1, 1).topk(beamWidth, dim=0)
            # decoderInput: beamWidth × 1
            decoderInput = topi % outputSize

            idFrom = topi.cpu().view(-1).numpy() // outputSize
            Y[localRestId, :i + 1] = np.hstack(
                [Y[localRestId[idFrom], :i],
                 decoderInput.cpu().numpy()])
            prob[localRestId] = prob[
                localRestId[idFrom]] + topv.data.cpu().numpy()
            hidden = hidden[:, idFrom, :]

            restId = (decoderInput != eosToken).cpu().view(-1)
            localRestId = localRestId[restId.numpy().astype('bool')]
            decoderInput = decoderInput[restId]
            hidden = hidden[:, restId, :]
            encoderOutput = encoderOutput[restId]
            beamWidth = len(localRestId)
            if beamWidth < 1:
                break
        lens = [
            i.index(eosToken) if eosToken in i else maxAnswerLength
            for i in Y.tolist()
        ]
        ans = [''.join(id2seq(self.id2word, i[:l])) for i, l in zip(Y, lens)]
        prob = [prob[i, 0] / np.power(lens[i], alpha) for i in range(len(ans))]
        if isRandomChoose or allRandomChoose:
            prob = [np.exp(p) for p in prob]
            prob = [p / sum(prob) for p in prob]
            if showInfo:
                for i in range(len(ans)):
                    print((ans[i], prob[i]))
            return random_pick(ans, prob)
        else:
            ansAndProb = list(zip(ans, prob))
            ansAndProb.sort(key=lambda x: x[1], reverse=True)
            if showInfo:
                for i in ansAndProb:
                    print(i)
            return ansAndProb[0][0]