Example #1
def recommendTag(category_parent_dict):
	outfile = open('tag_recommend_result.txt','wb')
	print 'loading jieba userdict'
	jieba.load_userdict('../../../data/jieba_userdict.txt')
	print 'loading stopword'
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	print 'reading app json'
	infile = open('../data/'+category_path+'.json','rb')
	for row in infile:
		json_obj = json.loads(row.strip())
		app_id = int(json_obj["soft_id"])
		app_name = json_obj["soft_name"]
		app_brief = json_obj["soft_brief"]
		app_download = int(json_obj["download_times"])
		outfile.write(str(app_id)+'<@>'+app_name+'<@>'+app_brief+'<@>')
		tag_recommend_set = set([])
		for category in category_parent_dict.keys():
			if category in app_name or category in app_brief:
				for parent_tuple in category_parent_dict[category]:
					if parent_tuple[1] == 0:
						tag_recommend_set.add(parent_tuple[0])
					else:
						tag_recommend_set.add(category)
					if parent_tuple[1] == 2:
						tag_recommend_set.add(parent_tuple[0])

		outfile.write(','.join(tag_recommend_set))
		outfile.write('\r\n')
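Every snippet in this collection relies on jieba.load_userdict, which reads a plain-text dictionary with one entry per line: the word, an optional frequency, and an optional POS tag, separated by spaces. A minimal, self-contained sketch of that format (file name and entries are made up for illustration):

import jieba

# write a tiny user dictionary: "word [frequency] [POS tag]", one entry per line
with open('demo_userdict.txt', 'w', encoding='utf-8') as f:
    f.write('云计算 5 n\n')
    f.write('八一双鹿 3 nz\n')
    f.write('凯特琳 nr\n')   # the frequency may be omitted

jieba.load_userdict('demo_userdict.txt')
print(jieba.lcut('凯特琳是云计算方面的专家'))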
Example #2
def load_training_data(model_path,user_dic_name=''):
	if user_dic_name != '':
		jieba.load_userdict(user_dic_name)
	global pos_sentiment_dic
	global neg_sentiment_dic
	global pos_word_count
	global neg_word_count
	global pos_prior
	global neg_prior
	cnx = sqlite3.connect(model_path+'model.db')
	cur = cnx.cursor()
	cur.execute('SELECT word,value FROM sentiment_positive_word')
	results = cur.fetchall()
	for result in results:
		pos_sentiment_dic[result[0]] = result[1]
	cur.execute('SELECT word,value FROM sentiment_negative_word')
	results = cur.fetchall()
	for result in results:
		neg_sentiment_dic[result[0]] = result[1]
	cur.execute('SELECT positive_word_count,negative_word_count,positive_document_count,negative_document_count FROM sentiment_baseline',)
	result = cur.fetchone()
	pos_word_count = int(result[0])
	neg_word_count = int(result[1])
	positive_document_count = float(result[2])
	negative_document_count = float(result[3])
	pos_prior = positive_document_count/(negative_document_count+positive_document_count)
	neg_prior = negative_document_count/(negative_document_count+positive_document_count)
Example #3
def word_list(path = conf.output_dir + "/tmp/"):
    jieba.initialize()
    jieba.load_userdict("./user_dict.txt")
    print "cutting words"
    dict = {}
    f = open(path+"/all_json.txt", "r")
    i = 0
    for line in f:
        if (i %100) == 0:
            sys.stderr.write(str(i) + "\n")
        i += 1
        json_obj = json.loads(line)
        danmu = json_obj['ci']
        for k in danmu.keys():
            words_list = danmu[k]
            word = jieba.cut(words_list)
            for w in word:
                dict[w] = dict.get(w, 0) + 1

    f.close()

    out = codecs.open(path + "words.txt", "wb", "utf-8")
    for k in dict.keys():
        out.write(k)
        out.write(" ")
        out.write(unicode(dict[k]))
        out.write("\n")

    out.close()
Example #4
 def __init__(self,type='default',user_pre=None,userdict_path=None,stop_words_path=None,words=None,params={}):
     '''
     type 'user':user_pre callable,params
     type 'default':userdict_path,stop_words_path,words
     type 'None':do Nonthing
     '''
     self.dict=userdict_path
     if self.dict!=None:
         jieba.load_userdict(self.dict)
     
     self.stop_words=['',' ','  ','\n','\r','\t']
     if stop_words_path!=None:
         self.stop_words=self.stop_words+[i.strip() for i in open(stop_words_path,'r')]
     self.stop_words=set(self.stop_words)
     self.params=params
     if words!=None:
         self.words=set(words)
     else:
         self.words=None
     
     if type=='user':
         self.pre=user_pre
     elif type=='default':
         self.pre=partial(default_preprocess,stop_words=self.stop_words,words=self.words)
     elif type=='None':
         self.pre=lambda x:x.strip()
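The 'default' branch above binds a default_preprocess helper that is not shown in this snippet. A hedged sketch of what such a helper could look like, matching the keyword arguments passed to partial() above (the body is an assumption, not the original implementation):

import jieba

def default_preprocess(text, stop_words=frozenset(), words=None):
    # hypothetical helper: segment, drop stop words, optionally keep only known words
    tokens = [t for t in jieba.cut(text.strip()) if t not in stop_words]
    if words is not None:
        tokens = [t for t in tokens if t in words]
    return tokens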
Example #5
def main():
    # Configure Argument Parser
    parser = argparse.ArgumentParser(description="Derive basic statistics of Chinese context.")
    parser.add_argument("srcfile", help="the plain text file of the Chinese context")
    parser.add_argument("-d", "--userdict", help="optional user-defined dictionary", default="")
    parser.add_argument("-s", "--stopwords", help="optional stop words list", default="")
    parser.add_argument("-o", "--output", help="the prefix of output files", default="output")
    parser.add_argument("-k", "--topK", help="max length of the list of tf-idf/text-rank", default=100)
    args = parser.parse_args()
    # Open the text file
    with open(args.srcfile, "r") as f:
        content = f.readlines()
    # Combine all content
    allContent = "".join(content)
    # Clean the text

    # Load user dictionary and stop words
    if args.userdict != "":
        jieba.load_userdict(args.userdict)
    if args.stopwords != "":
        jieba.analyse.set_stop_words(args.stopwords)
    # Segmentation
    segmentedContent = jieba.cut(allContent, HMM=False)
    # TF-IDF
    tfidf = jieba.analyse.extract_tags(allContent, topK=args.topK, withWeight=True)
    # text-rank
    tr = jieba.analyse.textrank(allContent, topK=args.topK, withWeight=True)
    # Output
    writeToTxt(segmentedContent, args.output + "_segmented.txt")
    writeListToCsv(tfidf, args.output + "_tfidf.csv", ["term", "tfidf"])
    writeListToCsv(tr, args.output + "_textrank.csv", ["term", "text-rank"])
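writeToTxt and writeListToCsv are project helpers that this snippet does not include. One possible implementation consistent with how they are called above (these bodies are assumptions, not the original code):

import csv

def writeToTxt(tokens, path):
    # the generator returned by jieba.cut is joined with spaces and written out
    with open(path, 'w', encoding='utf-8') as f:
        f.write(' '.join(tokens))

def writeListToCsv(rows, path, header):
    # rows are (term, weight) tuples from extract_tags / textrank
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(header)
        writer.writerows(rows)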
Example #6
def small_test(corpus, ner_dictionary):
    jieba.load_userdict(ner_dictionary)

    for sentences in corpus:
        words = pseg.cut(sentences)
        for word, flag in words:
            print('%s %s' % (word, flag))
Example #7
def start():
    sentence = raw_input('請輸入句子:')

    # jieba.enable_parallel(2) # enable parallel segmentation; the argument is the number of worker threads
    # jieba.disable_parallel() # disable parallel segmentation

    use_dict = True        # whether to use the traditional-Chinese dictionary
    use_user_dict = False  # whether to use a user-defined dictionary

    if use_dict:
        jieba.set_dictionary('dict/dict.txt.big')

    if use_user_dict:
        jieba.load_userdict('dict/user_dict.txt')

    getFullMode(sentence)
    getFullModeHMM(sentence)
    getAccurate(sentence)
    getAccurateHMM(sentence)
    getNewWord(sentence)
    getSearch(sentence)
    getPostag(sentence)
    getTokenize(sentence)
    getKeyWord(sentence)
    getKeyWord(sentence, 'TextRank')
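The getXxx helpers above are defined elsewhere in that project. As an illustration of the final two calls, a hedged Python 3 sketch of a getKeyWord that switches between TF-IDF and TextRank (the helper body is an assumption):

import jieba.analyse

def getKeyWord(sentence, algorithm='TF-IDF'):
    # hypothetical helper matching getKeyWord(sentence) / getKeyWord(sentence, 'TextRank')
    if algorithm == 'TextRank':
        keywords = jieba.analyse.textrank(sentence, topK=10, withWeight=True)
    else:
        keywords = jieba.analyse.extract_tags(sentence, topK=10, withWeight=True)
    for word, weight in keywords:
        print(word, weight)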
Example #8
File: my_kw.py, project: Minzc/YZKnowl
def gen_obj_na(infile, usrdic = '',objname = "伊利谷粒多"):
    if usrdic != "":
        jieba.load_userdict(usrdic)
    lines = [ ln.strip().decode('utf-8') for ln in open(infile).readlines()]
    for ln in lines:
        kws = list(jieba.posseg.cut(kw_util.tweet_filter(ln)))
        find_obj = False
        kws_lst = combine_kw(kws)
        print 'line:\t' + ln.encode('utf-8') + '\n'

        print "tag:\t",
        for w in jieba.analyse.extract_tags(kw_util.tweet_filter(ln)):
            print w.encode('utf-8'),
        print

        print 'seg:\t',
        for w in kws_lst:
            print w.word.encode('utf-8')+'/'+w.flag.encode('utf-8'),
        print

        for w in kws_lst:
            if w.word.encode('utf-8') == objname:
                find_obj = True
            if find_obj:
                if 'n' in w.flag :
                    print w.word.encode('utf-8')+"/n",
                if 'a' in w.flag:
                    print w.word.encode('utf-8')+"/a",
        print
Example #9
File: my_kw.py, project: Minzc/YZKnowl
def gen_bias_test(infile, usrdic = ""):
    if usrdic != "":
        jieba.load_userdict(usrdic)
    lines = [ ln.strip().decode('utf-8') for ln in open(infile).readlines()]
    kwdist = nltk.FreqDist()
    kwpair_mtr = {}
    for ln in lines:
        sublns = kw_util.tweet_filter(ln).split(' ')
        for subln in sublns:
            kws = jieba.cut(subln)
            kws_arr = []
            for kw in kws:
                if kw != ' ':
                    kws_arr.append(kw)

            for i in range(len(kws_arr)):
                kwdist.inc(kws_arr[i])
                for j in range(i+1,len(kws_arr)):
                    kwpair_mtr = add_kw_pair(kws_arr[i],kws_arr[j],kwpair_mtr)

    kfdist = {}

    for kw,count in kwdist.items():
        kfscore = 0
        for fqkw,fqcount in kwdist.items()[:100]:
            if kw != fqkw:
                if(kwpair_mtr.has_key(kw)):
                    kfscore += math.pow(kwpair_mtr[kw].get(fqkw,0)-fqcount*count,2)/(fqcount*count)
        kfdist[kw] = kfscore
    sorted_x = sorted(kfdist.iteritems(), key=operator.itemgetter(1))
    for k,scr in sorted_x:
        print k,scr
Example #10
File: init.py, project: JOHNKYON/Lab_models
def text_digitalize(raw):
    """
    将文本初始化并数字化
    :param raw:
    :return:
    """
    raw_without_space = map(lambda x: [re.sub('\s*', '', x[0])], raw)

    raw_without_space = [x[0] for x in raw_without_space]

    # temp = codecs.open("temp/corpus.txt", 'wb', encoding='utf8')

    jieba.load_userdict("data/jieba_dict.txt")
    raw_cut = [jieba.cut(x, cut_all=False) for x in raw_without_space]

    # for ele in raw_cut:
    #     print ele[0][0]

    # for ele in raw_cut:
    #     for a in ele:
    #         temp.write(a+'\t')
    #     temp.write('\n')
    #
    # temp.close()

    raw_without_sw = map(lambda x: [filter(lambda y: y not in stopwords, x)], raw_cut)

    # for ele in raw_doc:
    #     print ele[0][0]

    raw_doc = [x[0] for x in raw_without_sw]
    dic_corpus = algorithm_collection.digitalize(raw_doc)
    return dic_corpus
Example #11
File: init.py, project: JOHNKYON/Lab_models
def text_init(raw):
    """
    去除文本中的停用词,分词
    :param raw:
    :return 分词后的list,list中的元素为2级list,2级list中的元素为词。此时1级list中元素已经是未数字化的词向量:
    """

    # 去除空格符
    raw_without_space = re.sub(' *', '', raw)
    # 将不同的专业分开作list元素
    # 专业号的正则表达式
    major_re = re.compile(u"\d{6}\D")

    '''# 用于测试匹配数量不一致问题
    out_test = codecs.open("data/re_test.txt", 'wb', encoding='utf8')
    re_find = major_re.findall(raw_without_space)
    counter = 0
    for ele in re_find:
        out_test.write(str(counter) + '\t' + ele + '\n')
        counter += 1
    out_test.close()'''

    # split the whole string, using this regex as the delimiter
    raw_splited = major_re.split(raw_without_space)[1:]

    # word segmentation
    # load the custom dictionary
    jieba.load_userdict("data/jieba_dict.txt")
    raw_cut = map(lambda x: jieba.cut(x, cut_all=False), raw_splited)
    # remove stop words
    raw_without_sw = map(lambda x: filter(lambda y: y not in stopwords, x), raw_cut)
    return raw_without_sw
Example #12
File: init.py, project: JOHNKYON/Lab_models
def neural_init(raw):
    """
    用于将clean_person中的字段初始化为神经网络能接受的初始值
    字符串只简单分词
    :param raw:
    :return:
    """
    test = raw[0]
    re.sub('\s*', '', test[0])
    re.sub('\s*', '', test[1])
    re.sub('\s*', '', test[3])

    raw_without_space = map(lambda a: [re.sub('\s*', '', a[0]), re.sub('\s*', '', a[1]), a[2],
                                       re.sub('\s*', '', str(a[3])), re.sub('\s*', '', str(a[4])), a[5]], raw)

    jieba.load_userdict("data/jieba_dict.txt")
    raw_cut = [[jieba.cut(x[0], cut_all=False), jieba.cut(x[1], cut_all=False), x[2], jieba.cut(x[3], cut_all=False),
                jieba.cut(x[4], cut_all=False), x[5]] for x in raw_without_space]

    raw_without_sw = map(lambda a: [filter(not_in, a[0]), filter(not_in, a[1]), a[2], filter(not_in, a[3]),
                                    filter(not_in, a[4]), a[5]], raw_cut)

    # merge all the text lists together
    raw_text_all = map(lambda a: a[0]+a[1]+a[3]+a[4], raw_without_sw)

    # build the dictionary and the corpus
    dic_corpus = algorithm_collection.digitalize(raw_text_all)

    # build the dictionary matrix
    arr = matrix_former(dic_corpus[0], dic_corpus[1], dic_corpus)

    raw_digitalized = map(lambda a, b: np.hstack((a, [b[2], b[5]])), arr, raw_without_sw)

    return raw_digitalized
Example #13
File: __init__.py, project: mapix/Pinyin
    def __init__(self):

        self.word_to_pinyins = defaultdict(list)
        f = open(FILE_WORDS, 'rb')
        for line in f:
            pinyin, words = line.strip().decode("utf-8").split()
            for item in words:
                self.word_to_pinyins[item].append(pinyin)
        f.close()

        self.word_to_pinyin = {}
        f = open(FILE_WORD, 'rb')
        for line in f:
            word, pinyin = line.strip().decode("utf-8").split(",")
            self.word_to_pinyin[word] = pinyin
        f.close()

        self.term_to_pinyin = {}
        f = open(FILE_TERM, 'rb')
        for line in f:
            term, pinyin = line.strip().decode("utf-8").split("#")
            self.term_to_pinyin[term] = pinyin.split("@")
        f.close()

        f = open(FILE_USER_DICT, 'rb')
        jieba.setLogLevel(logging.INFO)
        jieba.initialize()
        jieba.load_userdict(f)
        f.close()
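As this constructor shows, jieba.load_userdict accepts either a path string or an already-open file object; the two forms below are equivalent (the file name is illustrative):

import jieba

jieba.load_userdict('user_dict.txt')      # pass a path
with open('user_dict.txt', 'rb') as f:    # or pass a file-like object, as above
    jieba.load_userdict(f)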
Example #14
File: score.py, project: frankyzhou/stock
def calEmotion(filename,city):
	# load dict
	most,very,more,insuff,ish,posdict,negdict,over=dict.init_dict()
	jieba.load_userdict("dic/stock_dict.txt")
	# init score
	score = {}
	cur_date = 0
	review_id =0
	#init excel hander
	w, ws = createExcel()
	# open database & get data
	rawdata, conn, cur = util.getData(filename,city)
	for row in rawdata:
		# collect tokens from yield
		seg_list,cur_date, review = getTokens(score,row,cur_date)
		#use sentiment score to cal the emotion by one comment and add it to score.	
		result = dict.sentiment_score_list(seg_list,posdict,negdict,most,very,more,ish,insuff,over)
		score[cur_date].append(result)
		# write down the review to xls
		review_id = review_id +1
		ws.write(review_id,0,review)
		ws.write(review_id,1,result)
	w.save("excel/" + filename + ".xls")
	# close the connection with database
	util.closeDB(conn, cur)
	# generate the emotionlist from score
	emotionList = genEmotionList(score)
	# show pic of the stock price and emotion by one stock
	showPriceAndEmotion(emotionList,filename,city)
Example #15
def mineAbbreviation():
	print 'mining abbreviation'
	jieba.load_userdict("../../../data/jieba_userdict.txt")
	stopword_set = text_process.getStopword('../../../data/stopword.txt')
	word2vec_model = Word2Vec.load('../../../data/word2vec.model')
	word_set = getWords()
	word_syn_dict = {}
	for word in word_set:
		word_syn_dict.setdefault(word,set([word]))
		if len(word) != 2:
			continue
		try:
			for simi_word_tuple in word2vec_model.most_similar(positive=[word],topn=20):
				simi_word = simi_word_tuple[0]
				simi_value = simi_word_tuple[1]
				reverse_word = word[1]+word[0]
				if reverse_word == simi_word:
					pass
				else:	
					if len(set(word)&set(simi_word)) != len(word) or simi_value < 0.5 or word in simi_word or reverse_word in simi_word:
						continue
				word_syn_dict[word].add(simi_word)
		except:
			pass
			# print word

	outfile = open('abbreviation.txt','wb')
	for word in word_syn_dict.keys():
		if len(word_syn_dict[word])>=2:
			outfile.write(word+'@'+','.join(word_syn_dict[word])+'\r\n')	
Example #16
    def __init__(self):
        # load the jieba user.f_dict
        root_path = os.path.dirname(os.path.abspath(__file__))
        jieba.load_userdict(os.path.join(root_path, "f_dict/user.dict"))

        # get the positive f_corpus and length
        self.pos_doc_list = []
        with open(os.path.join(root_path, "f_corpus/waimai/positive_corpus_v1.txt"), encoding="utf-8") as pos_f:
            for line in pos_f:
                # self.pos_doc_list.append(list(set(jieba.lcut(line.strip()))))
                self.pos_doc_list.append(jieba.lcut(line.strip()))
        self.pos_doc_length = len(self.pos_doc_list)

        # get the negative f_corpus and length
        self.neg_doc_list = []
        with open(os.path.join(root_path, "f_corpus/waimai/negative_corpus_v1.txt"), encoding="utf-8") as pos_f:
            for line in pos_f:
                # self.neg_doc_list.append(list(set(jieba.lcut(line.strip()))))
                self.neg_doc_list.append(jieba.lcut(line.strip()))
        self.neg_doc_length = len(self.neg_doc_list)

        # define the variable about train number
        self.pos_train_num = 0
        self.neg_train_num = 0
        self.pos_test_num = 0
        self.neg_test_num = 0

        runout_content = "You are using the waimai f_corpus version 1.0.\n"
        runout_content += "There are total %d positive and %d negative f_corpus." % \
                          (self.pos_doc_length, self.neg_doc_length)
        print(runout_content)
Example #17
    def cal_and_show_jd_hot_words(self, jd_dir='../spider/jd'):
        """
        calculate and show hot words of Job Description (JD)
        :param jd_dir:
        :return:
        """
        if not os.path.exists(jd_dir) or len(os.listdir(jd_dir)) == 0:
            print('Error! No valid content in {0}'.format(jd_dir))
            sys.exit(0)
        else:
            jd_and_dir = {_.split('.')[0]: os.path.join(jd_dir, _) for _ in os.listdir(jd_dir)}

            for k, v in jd_and_dir.items():
                text = "".join(pd.read_excel(v)['详情描述'])
                jieba.analyse.set_stop_words(STOPWORDS_PATH)
                jieba.load_userdict(USER_CORPUS)
                hot_words_with_weights = jieba.analyse.extract_tags(text, topK=30, withWeight=True, allowPOS=())

                frequencies = {_[0]: _[1] for _ in hot_words_with_weights}

                print(frequencies)

                x, y = np.ogrid[:300, :300]
                mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
                mask = 255 * mask.astype(int)

                wordcloud = WordCloud(font_path='./msyh.ttf', width=600, height=300, background_color="white",
                                      repeat=False,
                                      mask=mask)
                wordcloud.generate_from_frequencies(frequencies)

                import matplotlib.pyplot as plt
                plt.imshow(wordcloud, interpolation='bilinear')
                plt.axis("off")
                plt.show()
Example #18
 def __init__(self, src_file_name, dst_file_name):
     self.__srcFileName = src_file_name  # initialize input file name
     self.__dstFileName = dst_file_name  # initialize output file name
     self.__resultList = []  # an internal list to store outputs
     jieba.load_userdict('data/dict.txt')
     with open(src_file_name) as f:
         f_csv = csv.reader(f)
         headers = next(f_csv)
         for row in f_csv:  # read and manipulate every row in the csv file
             document_id = row[0]  # extract document id
             article = row[1]  # extract article
             washed_article = wash_data(article)
             sentences = split_sentence(washed_article)  # 注意:一句话内可能包含多组property和value
             p_v_dict = {}
             for sentence in sentences:
                 for (p, v) in get_properties_and_values(sentence).items():
                     log.a(document_id, p, v)
                     p = adjust(p)
                     if p is not None:
                         if p in p_v_dict:
                             if p_v_dict[p] < v:
                                 p_v_dict[p] = v
                         else:
                             p_v_dict[p] = v
             for (p, v) in p_v_dict.items():
                 self.__resultList.append((document_id, p, v))
Example #19
def WordSegmentation(sentence, type='jieba'):
	if type == 'jieba':
		from jieba import posseg
		jieba.load_userdict('../Dict/NetWords/networds.txt')
		# jieba.load_userdict('../Dict/Emoji/emoji2.txt')
		jieba.load_userdict('../Dict/NetWords/amazon.txt')
		# jieba.add_word('😁', 20)
		word_tag_list = posseg.cut(sentence.encode('utf-8'))
		word_list = list()
		for each in word_tag_list:
			each = str(each).split('/')
			word_list.append((each[0], each[1]))
		return word_list
	elif type == 'ltp':
		url_get_base="http://api.ltp-cloud.com/analysis/?"
		api_key="Y5a5D3B4xp9ujH4nyDUXvVlNNOCfyuhftwrXWVbA"
		format='plain'
		pattern='ws'	# ws为分词
		result=urllib2.urlopen\
		("%sapi_key=%s&text=%s&format=%s&pattern=%s" % \
			(url_get_base, api_key, sentence.encode('utf-8'), format, pattern))
		content=result.read().strip()
		return content
	else:
		return ""
Example #20
def extract_desc(data_list: []) -> []:
    result_list = []
    # load the custom dictionary
    jieba.load_userdict('/home/laomie/keywords.csv')

    for row in data_list:
        if (len(row) > 6):
            begin_ip_desc = row[5]
            desc_list = str(begin_ip_desc).split(' ')
            if len(desc_list) > 1 :
                area = desc_list[0]
                edu = desc_list[1]
                province = ''
                city = ''
                # use jieba segmentation to extract the province and city
                terms = jieba.cut(area)
                seg_list = ','.join(terms).split(',')
                if (len(seg_list) > 0):
                    if (seg_list[0] == '中国'):
                        temp_list = seg_list[1:]
                        if (len(temp_list) > 0):
                            province = temp_list[0]
                        if (len(temp_list) > 1):
                            city = temp_list[1]

                result = []
                result += row[:5]
                result += [province, city, edu]
                result += row[5:]
                result_list.append(result)

    return result_list
Example #21
def readSegFile():
	jieba.load_userdict("../../data/jieba_userdict.txt")
	infile = open('../../data/all_cn_seg_nwi_clean.txt','rb')
	outfile = open('../../data/all_word.txt','wb')
	stopword_set = text_process.getStopword('../../data/stopword.txt')
	word_set = set([])
	word_fre_dict = {}
	row_counter = 0
	for row in infile:
		row_counter += 1
		print row_counter
		row = row.strip().decode('utf-8')
		items = row.split('<@>')
		app_name = items[1]
		brief_seg = items[2].split()
		title_seg = jieba.cut(app_name)
		for title in title_seg:
			if text_process.isChinese(title) and title not in stopword_set:
				word_set.add(title)
				word_fre_dict.setdefault(title,0)
				word_fre_dict[title] += 1
		for brief in brief_seg:
			if text_process.isChinese(brief) and brief not in stopword_set:
				word_set.add(brief)
				word_fre_dict.setdefault(brief,0)
				word_fre_dict[brief] += 1

	sorted_list = sorted(word_fre_dict.items(),key=lambda p:p[1],reverse=True)
	for val in sorted_list:
		if val[1] >= 10:
			outfile.write(val[0]+','+str(val[1])+'\r\n')
Example #22
def tokenizer_word(iterator):
    jieba.load_userdict('./dict.txt')
    for sentence in iterator:
        sentence = sentence.decode("utf8")
        sentence = re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!,。:??、~@#¥%……&*()]+".decode("utf8"), "".decode("utf8"),
                          sentence)
        yield list(jieba.lcut(sentence))
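Because this tokenizer runs per batch, jieba.load_userdict is executed on every invocation. A hedged variant that loads the dictionary once at import time (path as in the original; the punctuation cleanup from the original is omitted here for brevity):

import jieba

jieba.load_userdict('./dict.txt')   # load once, not on every call

def tokenizer_word_once(iterator):
    for sentence in iterator:
        yield jieba.lcut(sentence)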
Example #23
def main():
    currentDir = os.path.abspath('.')
    userDictFile = os.path.join(currentDir,'userdict.txt')
    jieba.load_userdict(userDictFile)
    df = pd.read_csv("./processedtimeandlocation.csv")
   # df.drop(df.columns[-1], axis=1,inplace=True)
    m = df.shape[0]#获得DataFrame 只要.shape即可
    n = df.shape[1]#.shape 会获得一turple [0]为行 [1]为列
    # locationDF = pd.DataFrame({ '地点' : ''})
    # locationDF = ['地点']
    #df['去除地点和时间的句子'] = None
    #df['时间'] = None #增加新的一列,只要df['列名']即可 ,可以初始化为None
    #content = df['内容摘要']#获取一列数据为 df['列名']
    content = df['去除地点和时间的句子']
    segmentationContent = []
   # Dates = []
   # leftStrs = []
   # pattern = re.compile(r'([12]?[0-9]?[0-9]{1}[0-9]{1}[-/\.年])?([0-1]?[0-9]{1}[-/\.月][份]?)?([0-3]?[0-9]{1}[-/\.日]?)?([每天]?[早上]?[上午]?[中午]?[晚上]?[时]?)([\s])*([0-2]?[0-9][点]?[:]?[0-6][0-9][分]?)?')
    #pattern = re.compile(r'([0-9]{4}[-/\.年])?([0-1]?[0-9]{1}[-/\.月])?([0-3]?[0-9]{1}[-/\.日]?)?([\s])*([0-2]?[0-9][点]?[:]?[0-6][0-9][分]?)?')#定义匹配模式
   # pattern = re.compile(r'([0-9]{4}[-/\.年])*([0-1]?[0-9]{1}[-/\.月])+([0-3]?[0-9]{1}[-/\.日]?)([0-2]?[0-9][点]?[:]?[0-6][0-9][分]?)?')#定义匹配模式
    #jieba.add_word("年 t")  
    for i in range(m):
        string = content[i]
        seg = posseg.cut(string)
        sentence = []
        for w in seg:
            if(w.flag != 'x' and w.word !=' '):
              sentence.append(w.word) 
        segmentationContent.append(' '.join(sentence))
Example #24
	def __init__(self, ) :
		# setting.
		self.dict_path = 'dict/user.dict'
		self.talk_count = 0

		# language. 
		self.last_msg = None
		self.last_response = None
		self.keyword_dict = None
		color.beginComment()
		jieba.load_userdict(self.dict_path)
		color.end()

		# state.
		self.state_key = None
		self.state_piece = None
		self.active_class = None
		self.active_piece = None
		self.state_vspecific = None # more specific peieces than *state_piece*
		self.state_vspecific_extra = None # which categories have such more specific info.
		self.current_db = None
		self.dim = 0
		self.dialogue_color = color.pcolors.ENDC

		# handle yes/no question.
		self.yesno_list = list()
Example #25
    def analyze_structure(self, sentence):
        jieba.load_userdict(os.path.join(self.base_path, self.dict_file))
        pairs = jieba.posseg.cut(sentence)

        prepared_sentence = []
        for w, t in pairs:
            prepared_sentence.append((w, t))

        grammar = r"""
            SUB:
                {^<j|n|ng|nr|ns|nt|nz|r|x>+.*}
            PRE:
                {.*<v|vd|vg|vn>+.*}
            OBJ:
                {.*<j|n|nr|x>+$}
            ADV:
                {.*<p|r>+}
        """
        parser = nltk.RegexpParser(grammar)
        result = parser.parse(prepared_sentence)
        # result.draw()

        structure = {}
        for r in result:
            if isinstance(r, nltk.tree.Tree):
                # phrase = ""
                phrase = []
                for leaf in r.leaves():
                    # phrase += leaf[0]
                    phrase.append(leaf[0])
                structure[r.label()] = phrase
        return structure
Example #26
def load_hlm(f_path, batch_size = 1):
    jieba.load_userdict("./data/hlm/name.dic")
    seqs = []
    i2w = {}
    w2i = {}
    lines = []
    data_xy = {}
    f = open(curr_path + "/" + f_path, "r")
    for line in f:
        line = line.strip('\n').lower()
        if len(line) < 3 or "手机电子书" in line:
            continue
        seg_list = jieba.cut(line)

        w_line = []
        for w in seg_list:
            if w not in w2i:
                i2w[len(w2i)] = w
                w2i[w] = len(w2i)
            w_line.append(w)
            if len(w_line) == 100:
                lines.append(w_line)
                w_line = []
        if len(w_line) < 100:
            lines.append(w_line)
    f.close()
    seqs = lines
    data_xy = batch_index(seqs, i2w, w2i, batch_size)
    print "#dic = " + str(len(w2i))
    return seqs, i2w, w2i, data_xy
Example #27
def main():
    starttime = datetime.datetime.now()

    path = os.path.abspath('.')
    path = path.split('/')
    basepath = "/".join(path[:-2])

    dictpath = os.path.join(basepath,'data/myDict.txt')
    jieba.load_userdict(dictpath)
    datapath = os.path.join(basepath,'data/train/relation_train/task1.trainSentence')
    with open(datapath) as f:
        dataset = f.readlines()    

    target_rel = u'朋友'
    for line in dataset:
        try:
            data = line[:-1].split('\t')
            rel = data[0].decode('utf-8')
            entity1 = data[1].decode('utf-8')
            entity2 = data[2].decode('utf-8')
            sentence = data[3].decode('utf-8')
            mark = int(data[4])
            if rel == target_rel:
                if mark == 1:
                    print sentence,entity1,entity2
        except Exception, e:
            print e
Example #28
def word_segmentation(s):
    """
    Returns:
    seg_index: list of tuple, tuple(seg, place of this seg in s)
    """
    jieba.load_userdict('./data/usr.dict')
    seg_list = jieba.cut(s, cut_all=False)
#     result = pseg.cut(s)
#     for seg in result:
#         print(seg.word + '/' + seg.flag)

    seg_index = []
    last = 0
    print(seg_list)
    for seg in seg_list:
        seg = seg.strip("/")
        #print re.split('(《》)', seg)[0]
        begin = s.index(seg, last)
        last = begin + len(seg)
        seg_index.append((seg, begin))
        if sys.version[0] == '2':
            print ('(%s,%d)'%(seg,begin)),
#         elif sys.version[0] == '3':
#             print('(%s,%d)'%(seg,begin),end ="")
    print('\n')
    return seg_index
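jieba also ships a tokenize() helper that reports character offsets directly, which can replace the manual s.index bookkeeping above; a brief sketch:

import jieba

def word_segmentation_offsets(s):
    # jieba.tokenize yields (word, start, end) tuples over the unicode input
    return [(word, start) for word, start, end in jieba.tokenize(s)]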
Example #29
def main():
    starttime = datetime.datetime.now()

    path = os.path.abspath('.')
    path = path.split('/')
    basepath = "/".join(path[:-2])

    dictpath = os.path.join(basepath,'data/myDict.txt')
    jieba.load_userdict(dictpath)

    target_rel = u'夫妻'

    train_user_path =  os.path.join(basepath,'data/train_user.txt')
    with open(train_user_path) as f:
        userdata = f.readlines()

    userset = []
    for line in userdata:
        userset.append(line[:-1])

    for user in userset[0:1]:
        tupu_path = os.path.join(basepath,'data/train/entity_tupu/entity_tupu.%s' % user)
        with open(tupu_path) as f:
            tupu_data = f.readlines()

        entity_pair = []
        for line in tupu_data:
            data = line[:-1].split('\t')
            rel = data[0].decode('utf-8')
            entity1 = data[1].decode('utf-8')
            entity2 = data[2].decode('utf-8')
            if rel == target_rel:
                entity_pair.append([entity1,entity2])


        datapath = os.path.join(basepath,'data/train/entity_sentence/entity_sentence.%s' % user)
        with open(datapath) as f:
            dataset = f.readlines()    

        three_split_set = []
        for line in dataset:
            try:
                data = line[:-1].split('\t')
                entity1 = data[1].decode('utf-8')
                entity2 = data[2].decode('utf-8')
                sentence = data[0].decode('utf-8')
                if [entity1,entity2] in entity_pair or [entity2,entity1] in entity_pair:
                    print sentence,entity1,entity2
                    jieba.add_word(entity1,1000)
                    jieba.add_word(entity2,1000)
                    three_split = cut_sentence(sentence,entity1,entity2)
                    if three_split == None:
                        continue
                    three_split_set.append(three_split)

                    # if rel in sentence:
                        # print sentence
            except Exception, e:
                print e
Example #30
def main():
    starttime = datetime.datetime.now()
    
    path = os.path.abspath('.')
    path = path.split('/')
    basepath = "/".join(path[:-2])
    dictpath = os.path.join(basepath,'data/myDict.txt')
    jieba.load_userdict(dictpath)
    datapath = os.path.join(basepath,'data/train/relation_train/task1.trainSentence')



    with open(datapath) as f:
        dataset = f.readlines()

    data_count = len(dataset)
    idf_dict = {}
    for line in dataset:
        temp_data = []
        data = line[:-1].split('\t')
        entity1 = data[1].decode('utf-8')
        entity2 = data[2].decode('utf-8')
        sentence = data[3].decode('utf-8')
        x = ft.get_word_feature(entity1,entity2,sentence)
        if x == None:
            continue
        x = x[0:8]  # get rid of non-word feature
        wordset = set([ w for w in x if w != 0])
        for w in wordset:
            idf_dict[w] = idf_dict.get(w, 0.0) + 1.0

    for w in idf_dict.keys():
        idf_dict[w] = math.log( data_count / idf_dict[w] )

    idf_dictname = 'data/idf_dict.pkl'
    idf_dictfile = open(os.path.join(basepath,idf_dictname),'w')
    pickle.dump(idf_dict,idf_dictfile)
    idf_dictfile.close()



    # idf_dictname = 'data/idf_dict.pkl'
    # idf_dictfile = open(os.path.join(basepath,idf_dictname))
    # idf_dict = pickle.load(idf_dictfile)
    # idf_dictfile.close()

    # idf_dict = sorted(idf_dict.iteritems(), key = lambda x:x[1], reverse = True)
    # print len(idf_dict)



    # for pair in idf_dict[0:100]:
    #     print pair[0],pair[1]




    endtime = datetime.datetime.now()
    print 'elapsed time is %f'  %(endtime - starttime).seconds
Example #31
import jieba
import jieba.posseg as pesg

jieba.load_userdict(r".\load_file\person_dict.txt")  # 加载用户字典
character = ['\xa0', ' ', ',']  # 去除的字符串
entity = ['n', 'm', 'nr', 'x', 'ns', 'v', 'l', 'nz']  # 可能是实体的词性
addition = ['n', 'nr', 'x', 'ns', 'v', 'nz']  # 可以进行合并的词性


def create_spo(detail_content):
    """
    将句子解析成spo三元组
    :param detail_content: 
    :return: 
    """
    # print(detail_content)
    spo = []
    '''  反向匹配构建spo三元组  '''
    for index in range(len(detail_content) - 1, -1, -1):
        # print(detail_content[index])
        address = detail_content[0]
        if (detail_content[index].flag == 'x') and (
                detail_content[index - 1].flag == 'x') and index >= 0:
            spo.append(
                (address.word,
                 detail_content[index - 2].word + detail_content[index].word))
        if (detail_content[index].flag
                in addition) and (detail_content[index - 1].flag
                                  in addition) and index >= 0:
            # spo.append(((address.word, address.flag), (detail_content[index - 1].word, " ", detail_content[index].word)))
            spo.append(
Example #32
import re
import jieba
import functools
from model import *

jieba.load_userdict('dict-traditional.txt')
jieba.load_userdict('sky_dragon_name.txt')

def normalize(x, y):
    if y== ',' or y == '。' or y == ':' or y == '「' or y == '」' or y == '!' or y == '?' or y == '‘' or y == '’':
        return x + y
    
    return y

def tokenize(x, unit):
    if unit == "char":
        x = re.sub(" ", "", x)
        return list(x)
    if unit == "word":
        result = []
        for xx in jieba.cut(x.strip()):
            # if xx== ' ' or xx == ',' or xx == '。' or xx == ':' or xx == '「' or xx == '」' or xx == '!' or xx == '?' or xx == '‘' or xx == '’':
            if xx != ' ' and xx != '':  # skip empty/space tokens (the original 'or' condition was always true)
                result.append(xx)
                # if xx == ' ' or x== ' ':
                #     if len(result) != 0:
                #         result[-1] += xx
                # else:
                #     result.append(xx)
        
        return result
Example #33
def jieba_init():
    jieba.set_dictionary("UserDict.txt")
    jieba.load_userdict("UserDict.txt")
Example #34
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 25 10:14:06 2018

@author: Administrator
"""

import re
import logging
import numpy as np
import pandas as pd
from collections import Counter
import jieba
jieba.load_userdict('data_path/user_dict.txt')

logging.getLogger().setLevel(logging.INFO)


def clean_str(s):
    r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`ò¢{|}~!,。“”、\n:\tⅠⅡⅢⅣⅤⅥⅦⅧⅨⅩ()【】~; \r\n]'
    s = re.sub(r, '', s)
    seg_list = jieba.cut(s, cut_all=False)
    out_list = []
    for seg in seg_list:
        out_list.append(seg)
    out_s = ' '.join(out_list)
    return out_s


def load_embeddings(vocabulary):
    word_embeddings = {}
Example #35
# File Name: make_vocabulary.py
# Description:
# 做训练用的基础词表

"""
import sys
import re
import jieba
import common_segment

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("need input_file & seg_output_file & tfdf_output_file")
        exit(0)
    usr_dict = '../data/jieba_user.dict.org.fil'
    jieba.load_userdict(usr_dict)

    # 把事件相关词加入分词词典
    rfile = open('../data/event_rel')
    word_dict = {}
    for line in rfile.readlines():
        line = line.decode('utf-8')
        line = line.rstrip('\n')
        tlist = line.split('\t')
        # 事件相关词
        for w in tlist[1:]:
            if w not in word_dict:
                word_dict[w] = {}
                jieba.add_word(w, 100)
    rfile.close()
Example #36
#!/usr/bin/python
# -*- coding: utf-8 -*-#coding=utf-8
# ===========================================================
# Word segmentation wrapper
# ===========================================================
import jieba
import jieba.posseg as pseg
from general_tools import write_file
from __init__ import debug_flag
self_defined_dict_path = 'conf/self_defined_corpus.txt'
if __name__ != '__main__' and __name__ != 'jieba_split':
    self_defined_dict_path = 'tools/' + self_defined_dict_path
jieba.load_userdict(self_defined_dict_path)


def split_word(sentence, filename='', cut_all=True, show_nominal=False):
    split_words = ''
    if not show_nominal:
        word_list = jieba.cut(sentence, cut_all)
    else:
        word_list = pseg.cut(sentence)  # posseg.cut has no cut_all parameter
    for word in word_list:
        if debug_flag:
            print word
        split_words += '%s ' % word
    if not filename == '':
        write_file(filename, split_words)
    return split_words


def split_word_only(sentence, filename='', cut_all=True, show_nominal=False):
Example #37
import json
import pickle
import time
from typing import List

import jieba
jieba.dt.cache_file = 'jieba.cache.new'
jieba.load_userdict('../data/userdict.txt')
from set_bm25 import BM25  #if you want to unpickle a object, but the class definition is at other file, you should import this!


class Bm25_query:
    def __init__(self, contents, bm_system):
        self.contents = contents
        self.bm25 = BM25(import_data=bm_system)

    def query(self, keyword_set_str_list: List[str], optional: List[str]):
        keyword_set_list = []
        for keyword_set_str in keyword_set_str_list:
            keyword_set = list(jieba.cut(keyword_set_str))
            keyword_set = [x for x in keyword_set if x != ' '
                           ]  # remove ' ' such like 台大 管中閔 -> ['台大',' ','管中閔']
            keyword_set_list += [keyword_set]
            # TODO: Handle english convert case, eg. google -> Google

        beg_time = time.time()
        score_list = self.bm25.sim_all(keyword_set_list, optional)
        end_time = time.time()
        print('{} seconds for computing.'.format(end_time - beg_time))

        beg_time = time.time()
Example #38
#coding:gbk
"""
目标:基于Python和Gelphi的《黎明破晓的街道》人物关系图谱构建
作者:林冰清
"""
import codecs
import jieba 
import os
from jieba import posseg
names = {}
relationship = {}
Names = []
jieba.load_userdict("character.txt")  # 加载人物表
with codecs.open("黎明破晓的街道.txt",'r','gbk')as f:
	lines = f.readlines()
	for line in lines:
		poss = jieba.posseg.cut(line)
		Names.append([])  # 增加人物列表
		for sun in poss:
			if sun.flag != 'nr' or len(sun.word)<2:
				continue
			Names[-1].append(sun.word)
			if names.get(sun.word) is None:  # 判断人物是否在字典中
				names[sun.word] = 0
				relationship[sun.word] = {}
			names[sun.word] += 1
for line in Names:
	for name_1 in line:
		for name_2 in line:
			if name_1 == name_2:
				continue
Example #39
import re
import sys
import time
import json
import jieba
import sqlite3
import base64
import binascii
import requests
from bs4 import BeautifulSoup
from os import path
from PIL import Image
import numpy as np
import jieba
from jieba import analyse
jieba.load_userdict("userwords.txt")
import jieba.posseg as pseg
from snownlp import SnowNLP
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import timer

stopwords = open('stopwords.txt', 'rt', encoding='utf-8').readlines()


class WeiboSpider:
    def __init__(self):
        self.session = requests.session()
        self.connect = sqlite3.connect('data.db')
        self.cursor = self.connect.cursor()
        self.cookies = {
Example #40
def getGroupWordsFrequency(data):  #统计得到各组自身词频和与相邻组的相关数据,用于提取重要的关键词
    lables = ['countNeg', 'countNeu', 'countPos']  # 用于构造字符串
    flag = 0
    jieba.load_userdict('addWords.TXT')  #添加自定义词汇
    # # 以下停用词是成对出现的字符中的另一半,同样为了减少特征冗余
    stop = ['”', '】', ')', '》', ')']
    # 将以下常见微博用语统称为“转发”
    repost = ['转发', '快转', 'Repost', 'RepostWeibo']
    i = -1
    diction = []
    print('开始')
    for lable in data:  #data由textFormalize得到
        i = i + 1
        wordNum = 0  #该组的词汇个数(含重复)
        myDiction = dict()
        for text in lable:  #遍历处理该组的每一句
            words = jieba.lcut(text)
            for word in words:  #遍历处理该句的每一词
                wordNum = wordNum + 1
                if (word in stop):  #为停用词则跳过
                    continue
                if (word.isdecimal()):  #数字归为“数字“,减少冗余
                    myDiction['数字'] = myDiction.get('数字', 0) + 1
                elif (word in ['##']):  #连在一起的#记作两个#
                    myDiction['##'] = myDiction.get('##', 0) + 2
                elif (word in repost):  #转发格式归为“转发“,减少冗余
                    myDiction['转发'] = myDiction.get('转发', 0) + 1
                else:  #其他词汇则按自身处理
                    myDiction[word] = myDiction.get(word, 0) + 1
        temp = []
        for key in myDiction.keys():  #遍历处理该组的字典
            temp.append([key, myDiction[key]])  #转换为list
        myDiction = pd.DataFrame(data=temp, columns=['word',
                                                     lables[i]])  #转换为df
        myDiction[
            lables[i]] = myDiction[lables[i]] / wordNum * 1000  #将频数转换后放大后的频率值
        diction.append(myDiction)

    myDiction = pd.merge(diction[0], diction[1],
                         how='inner')  #将三组的df按交集合并为一个df
    myDiction = pd.merge(myDiction, diction[2], how='inner')
    rating = []
    for index, row in myDiction.iterrows():
        retList = getMaxStdDev([row[lables[i]] for i in range(3)])
        if (retList[0] == 0):
            continue
        rating.append([row['word'], retList[0], retList[1]])
    tempDf = pd.DataFrame(data=rating, columns=['word', 'neu/neg', 'pos/neg'])
    tempDf.sort_values(by=['neu/neg', 'pos/neg'],
                       ascending=[False, False],
                       inplace=True)  # 分数名列前茅者作为特征词
    tempDf.reset_index(drop=True, inplace=True)
    classID = -1
    oneKeep = -1
    twoKeep = -1
    refoIDList = []
    classIDList = []
    for index, row in tempDf.iterrows():
        if (row[1] != oneKeep):
            classID = classID + 1
            oneKeep = row[1]
            twoKeep = row[2]
        elif (row[2] != twoKeep):
            classID = classID + 1
            twoKeep = row[2]
        refoIDList.append(index)
        classIDList.append(classID)
    classIDList = pd.DataFrame(classIDList)
    refoIDList = pd.DataFrame(refoIDList)
    classIDList = pd.concat([refoIDList, classIDList], axis=1)
    tempDf = pd.concat([classIDList, tempDf], axis=1)
    with open('key.csv', 'w', newline='', encoding='UTF_8_sig') as f:
        writer = csv.writer(f)
        for index, row in tempDf.iterrows():
            writer.writerow(row)
    return tempDf
Example #41
import jieba
import jieba.posseg as pseg
import re
import time
import pandas as pd
import numpy as np
import os
import tqdm
mp=os.getcwd()

Dir=mp+'\data\Dir.txt'
Fpath=mp+'\data\StopWords.txt'


time_start=time.time()

jieba.load_userdict(Dir)
# 创建停用词list
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(Fpath, 'r', encoding='utf-8').readlines()]
    return stopwords


# 对句子进行分词
def seg_sentence(sentence):
    sentence1 = sentence.lower()#大写转小写
    sentence_seged = jieba.cut(sentence1.strip())
    stopwords = stopwordslist(Fpath)  # 这里加载停用词的路径
    outstr = ''
    for word in sentence_seged:
        if word not in stopwords:
            if word != '\t':
Example #42
import jieba
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
from bs4 import BeautifulSoup
from tqdm import tqdm
from collections import Counter
from scipy.misc import imread

# plotting: make Chinese characters display correctly
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
"""
1) basic cleaning of the Excel data: denoise and deduplicate
2) segment the title and the post body main_body with jieba in accurate mode
"""

######### segmentation ##############
jieba.load_userdict("userdict.txt")

data_raw = pd.read_excel("./data/汽车之家论坛.xlsx")

print(data_raw.shape)
data_raw = data_raw.drop_duplicates()
data_raw = data_raw[~data_raw['main_body'].isin([None, " "])]
data_raw = data_raw.fillna(0)


def readLines(filename):
    # 读文件的方法
    out = []
    with open(filename, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
        for line in lines:
Example #43
# coding:utf8
"""
Description:自定义去停用词
Author:伏草惟存
Prompt: code in Python3 env
"""

import re, jieba, sys

# 加载自定义分词词典
jieba.load_userdict("../dataSet/StopWord/user_dict.txt")

#********************1 结巴中文分词***********************************


# 利用jieba对文本进行分词,返回切词后的list
def seg_doc(str_doc):
    # 1 正则处理原文本
    sent_list = str_doc.split('\n')
    # map内置高阶函数:一个函数f和list,函数f依次作用在list.
    sent_list = map(textParse, sent_list)  # 正则处理,去掉一些字符,例如\u3000

    # 2 获取停用词
    stwlist = get_stop_words()

    # 3 分词并去除停用词
    word_2dlist = [
        rm_tokens(jieba.cut(part, cut_all=False), stwlist)
        for part in sent_list
    ]
Example #44
# -*- coding: utf-8 -*-
# !/usr/bin/pythonrrrr

from __future__ import print_function, unicode_literals
import sys
sys.path.append("../")
import jieba
import jieba.posseg as pseg

print("=== 自定义分词器 ===")

jieba.load_userdict("../../../resource/jieba/userdict.txt")

print("======= 词性标注 =======")
jieba.add_word("凯特琳", tag="nr")
jieba.add_word("李铁军", tag="nr")
test_send = ("李小福和李铁军是创新办主任也是云计算方面的专家;什么是八一双鹿\n"
             "例如我输入一个带'韩玉赏鉴'的标题,在自定义词类中也增加了此词为N类\n"
             "[台中]正确应该不会被切开。mac中可分出[石墨烯];此时又可以分出来凯特琳了。")
words = jieba.cut(test_send)
print("/".join(words))
print("=" * 40)

result = pseg.cut(test_send)
for w in result:
    print(w.word, "/", w.flag, ", ", end='')
print("=" * 40)

terms = jieba.cut("easy_install as great")
print("/".join(terms))
Example #45
# -*- coding: utf-8 -*-
import os
import math
import jieba
from collections import Counter
from .save_to_redis import save_to_redis
from .config import stop_words

abs_path = os.path.abspath(os.path.dirname(__file__))
# 加载自己定义的字典
jieba.load_userdict(abs_path + '/dict/dict.txt')


class TfIDf(object):
    def __init__(self, rows, all_comment=None, all_zan=None, all_time=None):
        """
        :param rows: list, 数据集合
        :param all_comment: list, 评论集合 -> [2, 2, 5, 7]
        :param all_zan:  list, 点赞集合 -> [1, 5, 0, 3]
        :param all_time: list, 发布时间集合 -> ['2016-10-31 01:37', '2016-11-01 11:13']
        """
        self.abs_path = os.path.abspath(os.path.dirname(__file__))
        self.rows = rows
        self.all_time = all_time
        self.all_comment = all_comment
        self.all_zan = all_zan
        self.temp_contents = []
        self.total_seg_list = []
        self.tf_dict = {}
        self.tf_idf_dict = {}
        self.tf_number = {}  # 关键字的频数,没有归一化前的数据
Example #46
Author: shileicao([email protected])
Date: 17-2-4 下午8:51
"""
import json
import os
import random
import sys

import jieba
import w2v

from cnn_clfier import C_MAX_SENTENCE_LEN, C_MAX_WORD_LEN

SPLIT_RATE = 0.8

jieba.load_userdict(os.path.join('../data', 'words.txt'))


def tokenizer(sentence):
    return jieba.lcut(sentence, cut_all=False)


def stat_max_len(data):
    max_word_len = 0
    max_sentence_len = 0
    for key in data:
        for sentence in data[key]:
            temp_max_word_len = max(
                [len(word) for word in tokenizer(sentence)])
            temp_max_sentence_len = len(tokenizer(sentence))
            if max_word_len < temp_max_word_len:
Example #47
import jieba
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras import regularizers
import sys, csv, os, time
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.models import load_model
from keras.layers import GRU, LSTM, Bidirectional

start = time.time()
# import tensorflow as tf
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# sess = tf.Session(config = config)

jieba.load_userdict(sys.argv[2])
f = pd.read_csv(sys.argv[1])
test_x = f['comment']
test_x = np.array(test_x)
for i in range(test_x.shape[0]):
    test_x[i] = jieba.lcut(test_x[i])
test_x = np.array(test_x)

w2vmodel = Word2Vec.load('word2vec.model')
tx2 = np.zeros((len(test_x), 40, 250))
for i in range(len(test_x)):
    # x = np.zeros((40, 250))
    if len(test_x[i]) > 40:
        for j in range(40):
            try:
                tx2[i][j] = w2vmodel[test_x[i][j]]
Example #48
File: divide.py, project: cszpf/lvshou
def getMainSentence(role, number=500):
    '''
    Extract the main body of the text; unrelated to the actual application
    :param role:
    :param number:
    :return:
    '''
    def fenci(x):
        '''
        Segment the text
        :param x: the text
        :return:
        the segmented text as space-separated tokens
        '''
        return ' '.join(
            [word for word in jieba.cut(x) if word not in (' ', '\n')])

    path1 = 'Tokens.csv'
    path2 = 'MainToken.csv'
    path3 = 'features.pk'
    if os.path.exists(path2):
        data = pd.read_csv(path2)
        with open(path3, 'rb') as fr:
            _vocab = pk.load(fr)
    else:
        if os.path.exists(path1):
            data = pd.read_csv(path1)
        else:
            jieba.load_userdict('setting/userdict1.txt')
            data = getAllSentence()
            data1 = data['sentenceList'].apply(str).apply(eval)
            data1 = data1.apply(lambda x: '\n'.join(
                [i['content'] for i in x if i['role'] == role]))
            data1 = data1.apply(lambda x: fenci(x))
            data['Tokens'] = data1
            del (data1)
            data.to_csv(path1, index=False)
        data = data.sample(4000)[['UUID', 'Tokens']]
        # print(list(data.sample(40)['Tokens']))
        # return
        # 提取特征词
        vec = CountVectorizer(ngram_range=(1, 3),
                              min_df=3,
                              max_df=0.9,
                              max_features=20000)
        vec.fit_transform(list(data['Tokens']))
        _vocab = [i for i in vec.vocabulary_.keys() if ' ' not in i]
        del (vec)
        print('已经训练好词表')
        # pattern = re.compile(r'{}'.format(' |'.join(_vocab)))
        data = data.sample(number)
        data.to_csv(path2, index=False)
        with open(path3, 'wb') as fw:
            pk.dump(_vocab, fw)
    data = data['Tokens']
    _data = []
    for i in list(data):
        temp = []
        for j in i.split(' '):
            if j in _vocab:
                temp.append(j)
        _data.append(' '.join(temp))
        temp = None
        # _data.append(''.join(pattern.findall(i)))
        i = None

    data = pd.DataFrame()
    data['MainToken'] = _data
    del (_data)
    data.to_csv('main.csv', index=False)
Example #49
    data_to_token_ids(src_train_path, src_train_ids_path, vocab_path,
                      tokenizer)
    data_to_token_ids(dest_train_path, dest_train_ids_path, vocab_path,
                      tokenizer)

    # 创建 token ids for the development data.
    src_dev_ids_path = os.path.join(dev_path, "content_dev_id")
    dest_dev_ids_path = os.path.join(dev_path, "title_dev_id")
    data_to_token_ids(src_dev_path, src_dev_ids_path, vocab_path, tokenizer)
    data_to_token_ids(dest_dev_path, dest_dev_ids_path, vocab_path, tokenizer)


if __name__ == '__main__':
    content_fp = root_path + '/corpus_50.txt'
    title_fp = root_path + '/corpus_title_50.txt'
    jieba.load_userdict(root_path + '/dict.txt')
    print(content_fp)

    train_path = os.path.join(root_path, "train")
    src_train_path = os.path.join(train_path, "content-train-origin.txt")
    dest_train_path = os.path.join(train_path, "title-train-origin.txt")

    dev_path = os.path.join(root_path, "dev")
    src_dev_path = os.path.join(dev_path, "content-dev-origin.txt")
    dest_dev_path = os.path.join(dev_path, "title-dev-origin.txt")

    test_path = os.path.join(root_path, "test")
    src_test_path = os.path.join(test_path, "content-test-origin.txt")
    dest_test_path = os.path.join(test_path, "title-test-origin.txt")

    # step1 获取出文本内容
Example #50
File: wjieba.py, project: wooght/HandBook
print("FullMode:" + "/".join(seg))

#cut_all=False 精确模式 默认
seg = jieba.cut("我毕业于攀枝花学院计算机系", cut_all=False)
print("FullMode:" + "/".join(seg))

seg = jieba.cut("我毕业于攀枝花学院计算机系")
print("FullMode:" + "/".join(seg))

#cut_for_search 搜索引擎模式
seg = jieba.cut_for_search("我毕业于攀枝花学院计算机系")
print("Search Mode:" + "/".join(seg))

print(sys.path)
#引入自定义字典
jieba.load_userdict(
    "F:\homestead\handbook\python\decision_tree\jieba_words\words.txt")

seg_list = jieba.cut("蒲文锋是python爬虫砖家也是云计算方面的专驾。")
print("Origin: " + "/".join(seg_list))

#动态调整词典
jieba.add_word("专驾")
#del_word()删除单词

print("/".join(jieba.cut("如果放到POST中将出错", HMM=False)))

#调节词频 使中,将可以分出来
jieba.suggest_freq(("中", "将"), tune=True)  #tune True 分出来,False不单独分出来
print("/".join(jieba.cut("如果放到POST中将出错", HMM=False)))

Original = "/".join(jieba.cut("遂宁市涪江大桥参加了沱江大桥的通车仪式。", HMM=False))
Example #51
import os
import nltk
import numpy as np
import pickle
import random
import re
import jieba
import datetime
import random

module_path = os.path.dirname(__file__)  #文件所在脚本的相对路径
#当前文件所在目录的上级目录
pre_dir = os.path.join(module_path, '..')  #相对路径的上级路径
#自定义词典要放在并行化程序之前,否则不起作用
jieba.load_userdict(os.path.join(pre_dir, 'data/self_define_dict.txt'))

#nltk.download()
padToken, goToken, eosToken, unknownToken = 0, 1, 2, 3


class Batch:
    #batch类,里面包含了encoder输入,decoder输入,decoder标签,decoder样本长度mask
    def __init__(self):
        self.encoder_inputs = []
        self.encoder_inputs_length = []
        self.decoder_targets = []
        self.decoder_targets_length = []


def loadDataset(filename):
Example #52
import jieba

jieba.load_userdict('./dict.txt')
jieba.suggest_freq('奥利给', True)
jieba.suggest_freq('奥利奥', True)
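A quick way to check that the adjusted frequencies took effect is to segment a sentence containing the new terms; the sample text below is illustrative:

import jieba

jieba.load_userdict('./dict.txt')
jieba.suggest_freq('奥利给', True)
jieba.suggest_freq('奥利奥', True)

# with the boosted frequencies the two terms should come out as single tokens
print(jieba.lcut('奥利给,来一块奥利奥'))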
Example #53
Name: Study
Description: this file implements the business logic for address recognition and verification
"""
'''
Split an address into a macro part and a micro part.
Macro part: determines the GIS coordinates of the building; the smallest useful granularity is street name / house number / residential-complex name / street marker.
Micro part: locates the unit inside the complex, basically in the form: building No. - unit - floor - room number.
For now the test cases only cover addresses registered in the Chengdu area; Jianyang addresses are not considered yet.
'''

# Step 1: split the address into its macro and micro parts.
# Split rule (to be confirmed): 1) the field contains a) "附*号", or b) "*号"

import jieba

jieba.load_userdict('/Applications/jieba-0.39/User_Define.txt')  # load the local user dictionary


# Check whether the address contains characters matching the predefined rules:
# if it does, split the string; if it does not, prompt the user to check the address.
def address_first_cut(sen):
    import re
    global sen1, sen2, sen3
    # Build the rule set; the most frequently occurring rules can be placed first so they are tried earliest
    address_rule = (r'附\d{1,5}号', r'街\d{1,5}号', r'路\d{1,5}号', r'段\d{1,5}号',
                    r'道\d{1,5}号', r'环\d{1,5}号')
    if len(address_rule) == 0:  # check whether the rule set is empty
        sen1 = 'false'
        sen2 = '请检查'
        sen3 = '规则'
    else:
        i = 0
Example #54
import jieba
import codecs

# Reference
# Original article: https://blog.csdn.net/oxuzhenyi/article/details/55511138

names = {}  # name dictionary
relationships = {}  # relationship dictionary
lineNames = []  # character relationships within each paragraph

# store the person names in a list
with codecs.open("persons.txt", "r", "utf-8") as f:
    persons = f.read().splitlines()

# every character and its occurrence count per paragraph
jieba.load_userdict("persons.txt")  # load the custom dictionary


def count_names(f):
    for line in f.readlines():
        # 分词并返回该词词性
        name = jieba.cut(line)
        # 为新读入的一段添加人物名称列表
        lineNames.append([])
        for w in name:
            # 如果不在人名列表里,则不为人名
            if w not in persons:
                continue
            # 为当前段的环境增加一个人物
            lineNames[-1].append(w)
            if names.get(w) is None:
Example #55
#coding=utf-8
import jieba
import json
from math import log
import os
jieba.load_userdict('NameDict_Ch_v2.txt')
file = open("questions_example.json")
data = json.load(file)
inverted_index = {}
fixed_index = {}
resume_index = []
DOC_NUM = 20
DIR = './split/'
WIKI_NUM = 920017


def create_index(words, question):
    for word in words:
        if word not in inverted_index:
            inverted_index[word] = {}
        if str(question) not in inverted_index[word]:
            inverted_index[word][str(question)] = 0
        inverted_index[word][str(question)] += 1


def tf(query, doc):
    for word, docs in inverted_index.items():
        if query == word:
            return docs[doc]

Example #56
# encoding=utf-8

"""
评测数据格式:  行号\t句1\t句2
本程序作用: 将每行句子分词处理
"""

import jieba
stopwords = [word.decode("utf8").strip() for word in open("../dictionary/stopwords.txt", 'r').readlines()]
jieba.load_userdict("../dictionary/dic.txt")


def remove_stopwords(segmented_list):
    out_str = ""
    for word in segmented_list:
        if word not in stopwords and word != " ":
            out_str += word
            out_str += ","
    return out_str[:-1]


def pretreat_sentence(sentence):
    seg_list = jieba.cut(sentence)
    final_seg_str = remove_stopwords(seg_list)
    return final_seg_str


def process(new_num, content):
    (_, s1, s2, tag) = content.decode("utf8").split("\t")
    new_num = '"' + str(new_num) + '",'
    new_s1 = '"' + pretreat_sentence(s1) + '",'
Example #57
    def forward(self, output_q, output_s):
        # Similarity Score
        # (B, 5F)
        b = output_q.size(0)
        score = self.translate(output_q)
        # (B, 1)
        score = torch.bmm(output_s.view(b, 1, 5 * args.num_filters),
                          score.view(b, 5 * args.num_filters, 1))
        # (B, 1)
        score = score.squeeze(1)
        return score


# Data PipeLine
jieba.load_userdict(DATA_DIR + "/cut_dict_uniq.txt")
STOPWORDS = [
    line.strip()
    for line in codecs.open(DATA_DIR +
                            "/stopwords_1009.txt", "r", "utf-8").readlines()
]


# =================================
#  Char Data Loader (Embedding)
# =================================
def load_embedding():
    char_vectors = KeyedVectors.load_word2vec_format(DATA_DIR +
                                                     "/embedding_char_300.bin",
                                                     binary=True)
    char2index = {}
Example #58
import pandas as pd
import tensorflow as tf
from itertools import chain
import sys
import numpy as np
import jieba
import keras
import keras.backend as K
from collections import defaultdict
from gensim.models.word2vec import Word2Vec
import gensim
from keras.layers import Dense, Embedding, LSTM, TimeDistributed, Input, Bidirectional, GRU, recurrent, Reshape
from keras.models import Model
from sklearn.model_selection import train_test_split
jieba.load_userdict('../data/keyword.txt')

f = pd.read_excel('../data/new1000.xlsx', header=0)
# f = pd.read_excel('../data/30.xlsx', header=0)
# f = pd.read_csv('data/data.csv', header=None, sep=',')
# f = open('data/smartPatent_20180512.xlsx', 'r')
stopwordfile = open('../data/StopWords_con.txt', 'r')


def getstopwordset():
    w = set()
    for lines in stopwordfile:
        arr = lines.split('\n')
        # print type(arr[0].decode('utf-8'))
        w.add(arr[0].decode('utf-8'))
    return w
Example #59
#方法2
import jieba
from jieba.analyse import extract_tags
import glob

import pandas as pd
df = pd.DataFrame(columns=["Context"])

for cut_word in glob.glob("DataCleaning/*.txt"):
    print("Now, we process the file is : ", cut_word)

    # 讀取DataCleaning資料夾的所有檔案,並且斷詞。
    with open(cut_word, "r", encoding="utf-8") as f:
        article = f.read()
        # 準備詞庫
        jieba.load_userdict("dict.txt.big")

        # 若斷詞過後,與我們想要的詞有些微出入,可自行創建詞庫。
        jieba.load_userdict("food_dict.txt")

        # 計算關鍵詞時,若有出現我們、和、的...等無關簡要的關鍵詞,則這些詞需建立在stop_words.txt,讓程式自動忽略,
        # 以利後續的資料分析。

        jieba.analyse.set_stop_words("stop_words.txt")

        word_cut = " ".join(jieba.cut(article))
        print(word_cut)
        print(extract_tags(article, 10))
        print("\n")

    # 做好的斷詞寫進去檔案
Example #60
 def __init__(self):
     jieba.load_userdict('../coreEntityEmotion_baseline/models/nerDict.txt')