def testReference(self):
    import jieba  # May fail to load jieba
    jieba.initialize(usingSmall=False)
    import jieba.posseg as pseg
    pwords = []
    content = u'上海今日新确诊3例人感染H7N9禽流感病例'
    _ = """
        ns 上海
        t 今日
        a 新
        v 确诊
        m 3
        n 例人
        v 感染
        eng H7N9
        n 禽流感
        n 病例
    """
    content = u'李克强:在半岛挑事无异于搬石头砸自己脚'
    _ = """
        nr 李克强
        p 在
        n 半岛
        v 挑事
        l 无异于
        v 搬
        l 石头砸
        r 自己
        n 脚
    """
    for word in pseg.cut(content):
        print word.flag, word.word
def __init__(self, dics=None):
    # Avoid a mutable default argument; fall back to a fresh dict per instance.
    self.word_dic = dics if dics is not None else {}
    self.fcounter = 0
    self.default_idf = 10
    self.log_base = math.e
    self.rubbish_set, self.rubbish_hd = self.get_rubbish_set()
    jieba.initialize()
def serve(filename):
    if os.path.exists(filename):
        try:
            receive(filename, b'["ping"]\n')
            return
        except:
            # not removed socket
            print("Found abandoned socket")
            os.unlink(filename)
    try:
        with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
            sock.bind(filename)
            sock.listen(5)
            jieba.initialize()
            while 1:
                conn, addr = sock.accept()
                received = conn.recv(1024)
                while received[-1] != 10:
                    received += conn.recv(1024)
                result = handle(received.decode('utf-8'))
                if result is None:
                    conn.sendall(b'\n')
                elif result == b'stop':
                    conn.sendall(b'\n')
                    conn.close()
                    break
                else:
                    conn.sendall(result + b'\n')
                conn.close()
    finally:
        if os.path.exists(filename):
            os.unlink(filename)
        print("Server stopped.")
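For reference, a minimal client sketch for the server above, assuming the same newline-terminated protocol; the socket path in the usage comment is purely illustrative:

import socket

def send_request(filename, payload):
    # `filename` is the Unix socket path; `payload` must already end with b'\n'.
    with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as sock:
        sock.connect(filename)
        sock.sendall(payload)
        received = sock.recv(1024)
        while received and received[-1] != 10:  # read until the trailing newline
            received += sock.recv(1024)
        return received

# Example (hypothetical socket path):
# print(send_request('/tmp/jieba.sock', b'["ping"]\n'))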
def __init__(self):
    self.word_to_pinyins = defaultdict(list)
    f = open(FILE_WORDS, 'rb')
    for line in f:
        pinyin, words = line.strip().decode("utf-8").split()
        for item in words:
            self.word_to_pinyins[item].append(pinyin)
    f.close()

    self.word_to_pinyin = {}
    f = open(FILE_WORD, 'rb')
    for line in f:
        word, pinyin = line.strip().decode("utf-8").split(",")
        self.word_to_pinyin[word] = pinyin
    f.close()

    self.term_to_pinyin = {}
    f = open(FILE_TERM, 'rb')
    for line in f:
        term, pinyin = line.strip().decode("utf-8").split("#")
        self.term_to_pinyin[term] = pinyin.split("@")
    f.close()

    f = open(FILE_USER_DICT, 'rb')
    jieba.setLogLevel(logging.INFO)
    jieba.initialize()
    jieba.load_userdict(f)
    f.close()
def word_list(path=conf.output_dir + "/tmp/"):
    jieba.initialize()
    jieba.load_userdict("./user_dict.txt")
    print "cutting words"
    dict = {}
    f = open(path + "/all_json.txt", "r")
    i = 0
    for line in f:
        if (i % 100) == 0:
            sys.stderr.write(str(i) + "\n")
        i += 1
        json_obj = json.loads(line)
        danmu = json_obj['ci']
        for k in danmu.keys():
            words_list = danmu[k]
            word = jieba.cut(words_list)
            for w in list(word):
                if w in dict.keys():
                    dict[w] += 1
                else:
                    dict[w] = 1
    f.close()
    out = codecs.open(path + "words.txt", "wb", "utf-8")
    for k in dict.keys():
        out.write(k)
        out.write(" ")
        out.write(unicode(dict[k]))
        out.write("\n")
    out.close()
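The manual frequency dictionary in the example above can be expressed more compactly with collections.Counter; a minimal sketch of the counting step only, with the file path and the all_json.txt layout carried over from the example as assumptions:

from collections import Counter
import json
import jieba

def count_words(json_path):
    # Count token frequencies over the 'ci' field of each JSON line,
    # mirroring the nested loop in the example above.
    counter = Counter()
    with open(json_path) as f:
        for line in f:
            danmu = json.loads(line).get('ci', {})
            for text in danmu.values():
                counter.update(jieba.cut(text))
    return counter

# count_words(path + "/all_json.txt").most_common(20) gives the 20 most frequent tokens.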
def get(self, keyword):
    pages = []
    spages = []
    words = []
    if keyword:
        import jieba  # May fail to load jieba
        jieba.initialize(usingSmall=True)
        words = list(jieba.cut(keyword, cut_all=False))
        words = [word for word in words if len(word) > 1]
        # words = list(jieba.cut_for_search(keyword))
        keyword = stringutil.parseUnicode(keyword)
        pages = snapi.getAllPages()
        pages = globalutil.search(pages, words)
        globalutil.populateSourceUrl(pages)
        twitterAccount = globalconfig.getTwitterAccount()
        spages = bs.search(words[0], twitterAccount)
    templateValues = {
        'keyword': keyword,
        'pages': pages,
        'spages': spages,
        'words': words,
    }
    self.render(templateValues, 'search.html')
def load(self): # load jieba first if not jieba.initialized: jieba.set_dictionary(self.jieba_dict_path) jieba.initialize() self.pydict = {} f = None try: # py.txt f = open(self.dict_path) for line in f: try: line = line.strip() except: continue sps = line.split('\t') if len(sps) != 3: print >>sys.stderr, 'bad format line [%s]' % line continue word = sps[0] py = sps[1] freq = float(sps[2]) if word in self.pydict: wordInfoLen = len(self.pydict[word]) i = 0 dup = False while i < wordInfoLen: if self.pydict[word][i].py == py: if self.pydict[word][i].freq < freq: self.pydict[word][i].freq = freq dup = True break if self.pydict[word][i].freq < freq: break i += 1 if not dup: pyInfo = PyInfo() pyInfo.py = py pyInfo.freq = freq self.pydict[word].insert(i, pyInfo) wordInfoLen += 1 for j in range(i + 1, wordInfoLen): if self.pydict[word][j].py == py: del self.pydict[word][j] break else: pyInfo = PyInfo() pyInfo.py = py pyInfo.freq = freq self.pydict[word] = [ pyInfo ] except Exception as e: try: f.close() except: pass return False self.is_load = True return True
def __init__(self, model=None, model_file=None):
    if model:
        self.pipeline, self.label_encoder = model
    elif model_file:
        self.load_model(model_file)
    else:
        raise Exception("param model or model_file should be passed")
    jieba.initialize()
    logging.info("predictor init successfully.")
def __init__(self):
    # Get the path of the current package
    _package_path_ = _context_path
    self._user_dict = _package_path_ + os.sep + "dic.data"
    self._user_stword = _package_path_ + os.sep + "stword.data"
    # Build the stop-word list
    self._stop_word_list = list(line.strip().decode("utf8") for line in open(self._user_stword, 'r').readlines())
    # print(self._user_dict, self._user_stword)
    jieba.set_dictionary(self._user_dict)
    jieba.initialize()
def split(toCut):
    jieba.initialize()
    toCut = unicode(toCut.decode("gbk"))
    retList = list(jieba.analyse.extract_tags(toCut, topK=255, withWeight=1))
    retList = [i for i in retList if i[0] != " "]
    for i in range(0, len(retList)):
        retList[i] = [retList[i][0].encode("GBK"), retList[i][1]]
    return retList
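With withWeight set, jieba.analyse.extract_tags returns (keyword, weight) pairs rather than bare strings, which is what the GBK re-encoding loop above relies on. A minimal sketch of the call on plain unicode text; the sample sentence is illustrative:

import jieba.analyse

text = u"结巴分词支持基于 TF-IDF 的关键词抽取"
# Each item is a (keyword, weight) tuple because withWeight=True.
for keyword, weight in jieba.analyse.extract_tags(text, topK=5, withWeight=True):
    print(keyword, weight)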
def participle(self):
    jieba.set_dictionary("dict/dict.txt")
    jieba.initialize()
    if self.radioButton.isChecked():
        self.result = jieba.cut(self.filetext, cut_all=True)
    elif self.radioButton_2.isChecked():
        self.result = jieba.cut(self.filetext, cut_all=False)
    elif self.radioButton_3.isChecked():
        self.result = jieba.cut_for_search(self.filetext)
    else:
        self.result = jieba.cut(self.filetext, cut_all=False)
    self.textBrowser.clear()
    self.textBrowser.setText('/'.join(self.result))
def __init__(self, processnum=1):
    logger.info('Initializing jieba...')
    jieba.initialize()
    logger.info('Successfully initialized jieba.')
    if processnum == 0:
        processnum = multiprocessing.cpu_count()
    if processnum > 1:
        logger.info(
            'jieba running in parallel mode with %d processes.', processnum
        )
        jieba.enable_parallel(processnum)
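jieba's parallel mode splits the input by line and fans it out to worker processes, so it only pays off on large texts and, because it relies on process forking, it is not available on Windows. A minimal sketch, assuming a multi-line input large enough to be worth parallelizing:

import jieba

jieba.initialize()
jieba.enable_parallel(4)     # 4 worker processes; POSIX only (uses fork)
words = jieba.lcut(u"第一行文本\n第二行文本\n第三行文本")
jieba.disable_parallel()     # switch back to single-process cutting
print(words)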
def _getTopWords(psegs, titles, stopWordPatterns, stopWords, userDict):
    content = '\n'.join(titles)
    import jieba  # May fail to load jieba
    if psegs:
        jieba.initialize(usingSmall=True)
        import jieba.posseg as pseg
        pseg.loadDictModel(usingSmall=True)
        pwords = []
        flags = psegs
        for word in pseg.cut(content):
            if word.flag not in flags:
                continue
            pwords.append(word.word)
    else:
        jieba.initialize(usingSmall=False)
        if userDict:
            jieba.load_userdict_items(userDict)
        pwords = jieba.cut(content, cut_all=False)

    words = []
    for word in pwords:
        # sometimes "\r\n\n" is encountered
        word = word.strip()
        if not word:
            continue
        if word in stopWords:
            continue
        if _isStopWord(stopWordPatterns, word):
            continue
        words.append(word)
    words.sort()

    lastWord = None
    lastCount = 0
    result = []
    _MIN_WORD_COUNT = 2
    for word in words:
        if lastWord != word:
            if lastCount >= _MIN_WORD_COUNT:
                result.append({'name': lastWord, 'count': lastCount})
            lastWord = word
            lastCount = 0
        lastCount += 1
    if lastCount >= _MIN_WORD_COUNT:
        result.append({'name': lastWord, 'count': lastCount})

    result.sort(key=lambda item: len(item['name']), reverse=True)
    result.sort(key=lambda item: item['count'], reverse=True)
    return [item['name'] for item in result]
def initialize():
    # Load conjunction data
    global CONJUNCTIONS
    CONJUNCTIONS = []
    from codecs import open
    with open('vendor/moedict.dict', 'r', encoding='utf8') as data:
        for entry in data:
            CONJUNCTIONS.append(entry.split()[0])

    # Load CJK parsing library
    jieba.set_dictionary('vendor/jieba_tc.dict')
    jieba.load_userdict('vendor/chewing.dict')
    jieba.initialize()
def run():
    start_time = time.clock()
    jieba.set_dictionary('jieba/dict.txt.big')
    jieba.initialize()
    print("jieba " + str(time.clock() - start_time))

    start_time = time.clock()
    news_rss_url = "http://hk.news.yahoo.com/rss/hong-kong"
    # news_rss_url = "http://hk.news.yahoo.com/rss/china"
    info = feedparser.parse(news_rss_url)

    start_time = time.clock()
    for entry in info.entries:
        # word count of each word of the summary
        word_list = getBagOfWords(preprocess(jieba.cut(stripTag(entry.summary))))
        # word count of each word of the title
        bag_of_word_of_title = getBagOfWords(preprocess(jieba.cut(stripTag(entry.title))))
        # Combine the word counts of summary and title; the title weighs more
        bag_of_word = Counter()
        for i in range(3):
            bag_of_word.update(bag_of_word_of_title)
        bag_of_word.update(word_list)
        entry["bag_of_words"] = bag_of_word
    print("preprocess " + str(time.clock() - start_time))

    # result = Counter()
    # for entry in info.entries:
    #     result.update(entry["bag_of_words"])
    # printList(result)

    # Cluster the entries
    start_time = time.clock()
    clusters = clustering.clustering([Cluster([Vector(entry)]) for entry in info.entries])
    print("clustering " + str(time.clock() - start_time))

    # Print the result
    newsList = []
    for (index, cluster) in enumerate(clusters):
        for vector in cluster.listOfVectors:
            news = News(index, (vector == cluster.centroidVector), vector.data["title"],
                        vector.data["published"], vector.data["link"])
            newsList.append(news.__dict__)
    return json.dumps(newsList)
def __init__(self, dics=None):
    '''
    fname_dic = {fid: set([word list])}
    word_dic = {'word': {fid: tf, ...}}
    '''
    if dics:
        self.set_vars_by_dics(dics)
    else:
        self.word_dic = {}
        self.fname_dic = {}
        self.fcounter = 0
        self.default_idf = 0
        self.log_base = math.e
        self.rubbish_set = set()
        self.proportion = 0.3
    jieba.initialize()
def get(self, eventScope, eventId):
    event = models.getEvent(eventScope, eventId)
    if not event:
        self.error(404)
        return
    event["pages"].sort(key=lambda page: page.get("published") or page["added"], reverse=True)
    if "keyword" in self.extraValues:
        import jieba  # May fail to load jieba
        jieba.initialize(usingSmall=True)
        words = list(jieba.cut(self.extraValues["keyword"], cut_all=False))
        for page in event["pages"]:
            page["grade"] = 0
            for word in words:
                if len(word) <= 1:
                    continue
                if stringutil.contains(page.get("title", ""), word):
                    page["grade"] += len(word)
        event["pages"].sort(key=lambda page: page["grade"], reverse=True)
    templateValues = {"event": event}
    self.render(templateValues, "event.html")
def visit_offcanvas(request):
    # Bug: if the same client refreshes several times at once, the responses may
    # come back at the same time and the content can get mixed up.
    ip = None
    if request.META.has_key('HTTP_X_FORWARDED_FOR'):
        ip = request.META['HTTP_X_FORWARDED_FOR']
    else:
        ip = request.META['REMOTE_ADDR']
    logger.info("%s BEGIN. POST:%s, GET:%s" % (ip, str(request.POST), str(request.GET)))
    global is_first_load
    mutex_update_news.acquire()
    if is_first_load:
        # print "[LOG %s] init news." % (time.strftime("%Y-%m-%d %X", time.localtime()))
        logger.info("init news.")
        if platform.system() == "Linux":
            jieba.enable_parallel(8)
        jieba.initialize()
        # jieba.set_dictionary('data/dict.txt.big')
        update_base()
        init_news2()
        thread.start_new_thread(thread_update_news, ("",))
        is_first_load = False
    mutex_update_news.release()
    queryDict = None
    if request.method == 'GET':
        queryDict = request.GET
    elif request.method == 'POST':
        queryDict = request.POST
    jsondata = get_jsondata(queryDict)
    fp = open('django_composite/offcanvas.html')
    t = Template(fp.read())
    fp.close()
    html = t.render(Context(jsondata))
    logger.info("%s END." % ip)
    return HttpResponse(html)
'''
def __init__(self):
    self.br = br = mechanize.Browser()
    self.br.set_handle_robots(False)  # ignore robots
    self.br.set_handle_refresh(False)
    self.sixHourBeforeTime = time.time() - 60 * 60 * 6
    self.db_address = "127.0.0.1"  # '54.251.147.205'
    if platform.system() == "Windows":
        self.features = "html5lib"
    else:
        self.features = "lxml"
    oauth_args = dict(
        client_id="482698495096073",
        client_secret="8c58b055fcb762a9780638dc401c85e2",
        grant_type="client_credentials",
    )
    oauth_curl_cmd = ["curl", "https://graph.facebook.com/oauth/access_token?" + urllib.urlencode(oauth_args)]
    oauth_response = subprocess.Popen(oauth_curl_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()[0]
    print oauth_curl_cmd
    print str(oauth_response)
    try:
        oauth_access_token = urlparse.parse_qs(str(oauth_response))["access_token"][0]
        self.graph = facebook.GraphAPI(oauth_access_token)
    except KeyError:
        print("Unable to grab an access token!")
    # self._pre_dict_combine('combine_dict.txt')
    # jieba.set_dictionary('combine_dict.txt')
    dict_path = os.path.dirname(os.path.abspath(__file__)) + "/dict.txt"
    print dict_path
    jieba.set_dictionary(dict_path)
    jieba.initialize()
def __init__(self, stop_file=None, use_tfidf=False):
    self.stop_words = ["", " "]
    if stop_file:
        with open(stop_file, 'r') as rf:
            tokens = rf.readlines()
            tokens = [t.strip().decode("u8") for t in tokens]
            self.stop_words.extend(tokens)
        print "*****"
        logging.info("load %d stop words" % len(self.stop_words))
    jieba.initialize()
    self.label_encoder = LabelEncoder()
    if use_tfidf:
        self.pipeline = Pipeline([
            ('vec', CountVectorizer(stop_words=self.stop_words)),
            ('feat', TfidfTransformer()),
            # ('clf', SGDClassifier())
            ('clf', MultinomialNB())
        ])
    else:
        self.pipeline = Pipeline([
            ('vec', CountVectorizer(stop_words=self.stop_words, binary=True)),
            ('clf', BernoulliNB(fit_prior=True))
        ])
    logging.info("init the classifier")
    print >> log_f, w.encode("utf-8"), "/" ,

print 'speed' , len(content)/tm_cost, " bytes/second"

Experimental result: on a 4-core 3.4 GHz Linux machine, exact-mode segmentation of the complete works of Jin Yong reached a speed of 1 MB/s, 3.3 times that of the single-process version.

Other dictionaries

A dictionary file with a smaller memory footprint:
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.small

A dictionary file with better support for traditional Chinese:
https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big

Download the dictionary you need and overwrite jieba/dict.txt with it, or point to it with jieba.set_dictionary('data/dict.txt.big').

Change of the module initialization mechanism: lazy load (since version 0.28)

jieba uses lazy loading: "import jieba" does not trigger loading of the dictionary right away; the dictionary is loaded and the trie built only once it is actually needed. If you want to initialize jieba by hand, you can also do so explicitly:

import jieba
jieba.initialize()  # manual initialization (optional)

Versions before 0.28 could not specify the path of the main dictionary; with the lazy-load mechanism you can now change it:

jieba.set_dictionary('data/dict.txt.big')

Example:

#encoding=utf-8
import sys
sys.path.append("../")
import jieba

def cuttest(test_sent):
    result = jieba.cut(test_sent)
    print " ".join(result)

def testcase():
    cuttest("这是一个伸手不见五指的黑夜。我叫孙悟空,我爱北京,我爱Python和C++。")
    cuttest("我不喜欢日本和服。")
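Because of lazy loading, the first call that actually needs the dictionary pays the start-up cost, so timing jieba.initialize() separately is a simple way to see it; a minimal sketch (timings obviously depend on the machine and the dictionary in use):

import time
import jieba

start = time.time()
jieba.initialize()   # the dictionary is loaded and the prefix dict is built here
print("initialize: %.2f s" % (time.time() - start))

start = time.time()
print("/".join(jieba.cut(u"我爱北京天安门")))
print("cut after init: %.4f s" % (time.time() - start))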
def JIEBAInit(file_path='OpinionAnalysis/dict/'):
    logging.info('Loading dictionary and initializing jieba...')
    jieba.set_dictionary(file_path + 'dict.txt.big')
    jieba.load_userdict(file_path + 'userdict.txt')
    jieba.initialize()
        line_name = name
        line = {'x': line_x, 'y': line_y, 'name': line_name}
        # print(line_x)
        # print(line_y)
        result = {'pie': pie, 'bar': bar, 'line': line}
        # With more data, topic extraction can be slow, so run it in a child thread
        # and return the result through a global variable.
        # Start a subthread for the LDA analysis.
        try:
            _thread.start_new_thread(subthread_lda_analyse, ("Thread-lda", comments,))
        except:
            print("Error: can not start thread")
        return 'result generate success'
    return 'flask not get name'


if __name__ == '__main__':
    print('jieba initializing...')
    jieba.initialize()
    print('loading fasttext model...')
    fasttext_model = fasttext.load_model(fasttext_model_path)
    print('loading lda model...')
    model_path = './train_model/lda_model/LDA_model'
    lda_model = gensim.models.ldamodel.LdaModel.load(model_path)
    radar_x = []
    radar_y = []
    result = {}
    app.run("0.0.0.0", threaded=True)
def jieba_initialize():
    if not platform.system().upper().startswith("WINDOWS"):
        jieba.enable_parallel(multiprocessing.cpu_count())
    jieba.load_userdict('resources/QAattrdic.txt')
    jieba.initialize()
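Several examples here pass a custom dictionary to jieba.load_userdict. The expected file format is one entry per line: the word, an optional frequency, and an optional part-of-speech tag, separated by spaces. A minimal sketch that builds such a file on the fly; the words themselves are only illustrative:

import tempfile
import jieba

# One entry per line: word [frequency] [POS tag]; frequency and tag are optional.
entries = u"云计算 5\n创新办 3 i\n自定义词\n"

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as f:
    f.write(entries)
    userdict_path = f.name

jieba.load_userdict(userdict_path)
# Individual words can also be added programmatically:
jieba.add_word(u"自定义词", freq=100)
print("/".join(jieba.cut(u"创新办主任也是云计算方面的专家")))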
if args.pos:
    import jieba.posseg
    posdelim = args.pos

    def cutfunc(sentence, _, HMM=True):
        for w, f in jieba.posseg.cut(sentence, HMM):
            yield w + posdelim + f
else:
    cutfunc = jieba.cut

delim = text_type(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    result = delim.join(cutfunc(ln.rstrip('\r\n'), cutall, hmm))
    if PY2:
        result = result.encode(default_encoding)
    print(result)
    ln = fp.readline()

fp.close()
class WordSegmentation(object):
    '''Word segmentation'''

    # Load the stop-word dictionary
    stop_words_file = {}.fromkeys([line.decode('utf8').strip() for line in open(util_path.stop_words_path)])
    # Load the domain-specific jieba dictionary
    jieba.set_dictionary(util_path.jieba_dict_path)
    jieba.initialize()

    def addotherdics(self):
        dfolder = util_path.otherdict_folder
        othdpath = [os.path.join(dfolder, i) for i in os.listdir(dfolder)]
        for inf in othdpath:
            print("add words from %s" % inf)
            with codecs.open(inf, 'rU', encoding='utf8') as f:
                for w in f:
                    w = w.strip()
                    if w:
                        jieba.add_word(w)

    def segment(self, sent, stop_words_file=stop_words_file, mode='normal', addotherdic=False):
        """
        Segment the input text, optionally filtering stop words, and choose the segmentation mode.
        :param sent: sentence to segment
        :type sent: unicode string
        :param stop_words_file: stop-word table; pass stopwords=None to skip stop-word filtering. The bundled table is loaded by default.
        :type stop_words_file: dict
        :param mode: segmentation mode (normal, tf-idf, TextRank). normal is plain segmentation; tf-idf extracts keywords with the TF-IDF algorithm; TextRank extracts keywords with the TextRank algorithm. Defaults to normal.
        :type mode: unicode string
        :return: list of tokens after segmentation
        :rtype: list
        """
        sentence_words = []
        # Remove punctuation and spaces
        # punct = set(
        #     u''' :!),.:;?]}¢'"、。〉》」』】〕〗〞︰︱︳﹐、﹒﹔﹕﹖﹗﹚﹜﹞!),.:;?|}︴︶︸︺︼︾﹀﹂﹄﹏、~¢々‖•·ˇˉ―--′’”
        #     ([{£¥'"‵〈《「『【〔〖([{£¥〝︵︷︹︻︽︿﹁﹃﹙﹛﹝({“‘-—_…''')
        # filterpunt = lambda s: ''.join(filter(lambda x: x not in punct, s))
        # sent = filterpunt(sent)
        if addotherdic:
            self.addotherdics()

        if mode == 'tf-idf':
            # Keyword extraction based on the TF-IDF algorithm
            sent = jieba.analyse.extract_tags(sent.strip(' \t\n\r'))
        elif mode == 'TextRank':
            # Keyword extraction based on the TextRank algorithm
            tr = jieba.analyse.TextRank()
            tr.span = 2
            sent = tr.textrank(sent.strip(' \t\n\r'), topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
        else:
            # Normal segmentation; strip line breaks etc.
            sent = jieba.cut(sent.strip(' \t\n\r'))

        for w in sent:
            # seg = str(w.encode('utf-8').strip())
            seg = w.strip()
            if stop_words_file is None:
                sentence_words.append(seg)
            else:
                res = SentenceSegmentation.has_number_character(seg)
                if not stop_words_file.has_key(seg) and not res:
                    # The body text needs stop-word removal (the old punctuation set was merged
                    # into the stop-word dictionary; single digits are also excluded).
                    sentence_words.append(seg)
        return sentence_words
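The segment method above switches between plain cutting and the two keyword extractors that ship with jieba. A minimal sketch of how those two extractors compare on the same text, independent of the surrounding class; the sample sentence is illustrative:

import jieba.analyse

text = u"自然语言处理是人工智能和语言学领域的分支学科,关注计算机和人类语言之间的交互。"

# TF-IDF based keyword extraction
print(jieba.analyse.extract_tags(text, topK=5))

# TextRank based keyword extraction, restricted to nouns/verbs via allowPOS
print(jieba.analyse.textrank(text, topK=5, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v')))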
# ┃      ┃ +
# ┃      ┗━━━┓ + +
# ┃          ┣┓
# ┃          ┏┛
# ┗┓┓┏━┳┓┏┛ + + + +
#  ┃┫┫ ┃┫┫
#  ┗┻┛ ┗┻┛+ + + +
"""
Author = Eric_Chan
Create_Time = 2016/05/29
Build the vocabulary
"""
import jieba
import sys

jieba.initialize()  # start the jieba module manually


def load_file(file_name, charset='utf-8'):
    """
    Read a file and return its lines as a list.
    :param file_name: file path
    :param charset: encoding used to decode the file content, utf-8 by default
    :return: list of lines
    """
    f1 = open(file_name)
    line = f1.readline().decode(charset).strip()
    line_list = []
    while line:
        line = line.strip()
        if line:
                               page=page,
                               error=True)
    except:
        print('high search error')


# Route to the detail page
@app.route('/search/<id>/', methods=['GET', 'POST'])
def content(id):
    try:
        doc = find([id], extra=True)
        return render_template('content.html', doc=doc[0])
    except:
        print('content error')


# Recommend the K nearest documents
def get_k_nearest(db_path, docid, k=5):
    conn = sqlite3.connect(db_path)
    c = conn.cursor()
    c.execute("SELECT * FROM knearest WHERE id=?", (docid,))
    docs = c.fetchone()
    # print(docs)
    conn.close()
    return docs[1:1 + (k if k < 5 else 5)]  # max = 5


if __name__ == '__main__':
    jieba.initialize()  # manual initialization (optional)
    app.run()
def mentioned_trend(baseurl, mysqlhostIP, mysqlUserName='******', mysqlPassword='', dbname='btv'): # 分词 jieba.initialize() # 连接数据库 sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute( '''CREATE TABLE IF NOT EXISTS gala_region_interaction(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, region varchar(50), interaction bigint(50), date Date, program_id varchar(50), program varchar(50)) DEFAULT CHARSET=utf8;''' ) print '新建库成功' os.popen('kinit -k -t ctvit.keytab ctvit') kerberos_auth = HTTPKerberosAuth(mutual_authentication=OPTIONAL) # 表名 tablename = "DATA:WEIBO_POST_Keywords" r = requests.get(baseurl + "/" + tablename + "/*", auth=kerberos_auth, headers={"Accept": "application/json"}) if issuccessful(r) == False: print "Could not get messages from HBase. Text was:\n" + r.text quit() bleats = json.loads(r.text) # 存储评论数据 tempData = [] count = 0 printCount = 0 # row_prefix, limit可以限定次数 # for key,data in table.scan(limit = 10, batch_size = 10): region_box = list() date_mentioned_dict = dict() # 时间属性 # inter为0,即为当日 inter = 0 now = int(time.time()) - 86400 * inter timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print otherStyleTime # bleats is json file for row in bleats['Row']: for cell in row['Cell']: columnname = base64.b64decode(cell['column']) value = cell['$'] if value == None: print 'none' continue if columnname == "base_info:match": column = base64.b64decode(value) if column == "春晚": if columnname == "'base_info:cdate'": cdate = base64.b64decode(value) if cdate == otherStyleTime: if columnname == "'base_info:geo'": city_mentioned = base64.b64decode(value) region_box_count = dict(Counter(region_box)) region_box_count = sorted(region_box_count.iteritems(), key=lambda e: e[1], reverse=True) for i in region_box_count: city = i[0] how_many_times = i[1] # 地域 tempData.append(city) # 出现次数 tempData.append(how_many_times) # 日期,这是插表的时间 now = datetime.datetime.now() tempData.append(now) # program_id tempData.append('12345') # 节目名称 tempData.append('2016年北京卫视春节联欢晚会') sqlcursor.execute( '''insert into gala_region_interaction(region, interaction, date, program_id, program) values (%s, %s, %s, %s, %s)''', tempData) sqlConn.commit() tempData = [] sqlConn.close()
def init_jieba(self):
    jieba.initialize()
    for key in self.data.keys():
        self.data[key]['name'] = list(jieba.cut(self.data[key]['name']))
        self.data[key]['syptom'] = list(jieba.cut(self.data[key]['syptom']))
class NlpUtil(object): punctuations_set = _load_words(GlobalNames.PUNCTUATIONS_FILE) stopwords_set = _load_words(GlobalNames.STOPWORDS_FILE) user_define_words = _load_words(GlobalNames.USER_DEFINE_WORDS) remove_words_set = _load_words(GlobalNames.REMOVE_WORDS_FILE) # Init jieba jieba.initialize() for w in user_define_words: jieba.add_word(w, freq=1000000) corpus_dict = None tfidf_model = None url_pattern = re.compile(r"(https|http)://.+?html") digit_pattern = re.compile(r"\d+") bracket_pattern = re.compile(r"\[.+?\]") not_place_set = set([ "京东", "上门", "东西", "拜拜", "满意度", "新旧", "入口", "莫大", "蓝牙", "英伦", "顺顺利利", "哥哥", "立马", "海鲜", "回邮", "太多", "长北", "南那", "白跑", "天黑", "天阿", "美华", "华联", "日及", "山山", "京福顺", "卡拿", "太卡", "太大", "千古", "英哥", "两棵树", "太累", "包邮", "加半", "中华人名共和国", "六便士", "串联", "非顺丰", "中考", "北冰洋", "下嫩", "安安", "太鲜", "上拉", "入店", "上下水", "图京", "之城", "中断", "中武", "伦理", "中道", "之康", "多维度", "黑边", "中爱", "之泰", "锦园店", "三国", "阿门", "肯本", "刚京麦", "大黑", "朝霞", "关门大吉", "哥别", "沧桑", "下山", "日京京", "沙沙", "牙牙", "顿顿", "山高", "钱和京", "非买", "上旧", "四科", "西东", "上岗", "大山", "福尔马林", "滑黑", "上东", "中上", "内马尔", "中同", "中达", "下欧", "四门", "深春", "正东", "江南春", "入维", "大班", "中联", "猫沙", "长卡", "几环", "尾塞", "小桥流水", "澳邮", "上中", "英雄", "镇镇", "如东", "上口", "加邮", "八国", "福利", "台基", "那本", "中邮", "六本", "维沙", "中黑", "上美", "加花", "天哇", "远超过", "大拿", "贵干", "苏中", "三本", "酒塞", "七本", "美院", "中通", "美人壶加", "中充", "下国", "京伦", "九联", "上马", "美化", "江湖", "黑店", "几米远", "午安", "七哥", "角美", "日春", "几比", "确保安全", "壶水", "荷塘月色", "云集", "拉边", "欧克", "中右", "加的京", "上路", "烟嘴", "临证指南", "串口卡", "新建", "安利", "山泉水", "苏泊尔", "墨黑", "胶盆", "长达", "商城" ]) @classmethod def place_recognize(cls, text): places = [ w for w, flag in pseg.cut(text) if "ns" in flag and len(w) >= 2 and w not in cls.not_place_set and "哈" not in w and "之" not in w and "本" not in w and "中" not in w and "嫩" not in w and "大" not in w and "鲜" not in w and "国" not in w and "上" not in w and "确" not in w and "牙" not in w and "壶" not in w and "阿" not in w and "入" not in w and "哥" not in w and "颗" not in w and "的" not in w and "联" not in w and "哇" not in w ] return places @classmethod def tokenize(cls, text, filter_punctuations=False, filter_stopwords=False, filter_alpha=False, remove_words=False, normalize_url=False, recognize_place=False, minimum_tokens_num=1): '''Tokenize text''' try: places = cls.place_recognize(text) for w in places: text = text.replace(w, "[地址x]") text = cls.digit_pattern.sub("[数字x]", text) if normalize_url: text = cls.url_pattern.sub("URL", text) tokens = jieba.lcut(text) text = " ".join(tokens) for s in cls.bracket_pattern.findall(text): text = text.replace(s, s.replace(" ", "")) text = text.replace(u"# E - s [数字x]", u"#E-s[数字x]") text = text.replace(u"# E - s DIGIT [数字x]", u"#E-s[数字x]") text = text.replace(u"< s >", "<s>") tokens = text.split() tokens_copy = copy.copy(tokens) # Filter words. 
if filter_punctuations: tokens = [w for w in tokens if w not in cls.punctuations_set] if filter_stopwords: tokens = [w for w in tokens if w not in cls.stopwords_set] if filter_alpha: tokens = [ w for w in tokens if not w.encode("utf-8").isalpha() or w in set(["URL"]) ] if remove_words: tokens = [w for w in tokens if w not in cls.remove_words_set] if len(tokens) < minimum_tokens_num: tokens = tokens_copy new_tokens = tokens[:1] t_len = len(tokens) for i in range(1, t_len): if tokens[i] != tokens[i - 1]: new_tokens.append(tokens[i]) return new_tokens except Exception as e: print("text=%s, errmsg=%s" % (text, e)) return [text] @classmethod def get_tfidf(cls, words): if cls.tfidf_model is None: corpus_dict_path = get_file_path(GlobalNames.CORPUS_DICT_FILE) cls.corpus_dict = corpora.Dictionary.load(corpus_dict_path) corpus_tfidf_path = get_file_path(GlobalNames.CORPUS_TFIDF_FILE) cls.tfidf_model = models.tfidfmodel.TfidfModel.load( corpus_tfidf_path) bow = cls.corpus_dict.doc2bow(words) tfidf = cls.tfidf_model[bow] tfidf = [(cls.corpus_dict[x[0]], x[1]) for x in tfidf] tfidf.sort(key=lambda x: x[1], reverse=True) return tfidf @classmethod def get_keywords(cls, text, size=3, way=None): if way == None or way == "tfidf": tokens = cls.tokenize(text) tfidf = cls.get_tfidf(tokens) ret_tokens = [x[0] for x in tfidf[:size]] return ret_tokens elif way == "textrank": return jieba.analyse.textrank(text, topK=size)
def search_init(self):
    jieba.initialize()
    self.pagerank = PageRank()
class NLPUtil(object): _valid_token_len = 5 _wordseg_pattern_cfg = [ re.compile(r'{.*?}', re.U), ] _emoji_pattern_cfg = re.compile('[\U00010000-\U0001ffff]', re.U) _replace_pattern_cfg = { 'float_t': re.compile('\d+\.\d+'), 'phone_t': re.compile('1[0-9]{10}'), 'email_t': re.compile('[^@|\s]+@[^@]+\.[^@|\s]+'), } _illegal_char_set = set([]) # init jieba jieba.initialize() ud_words = config.g_ud_words_cfg for w in ud_words: jieba.add_word(w, freq=100000000) @classmethod def remove_illegal_gbk_char(cls, text_unicode): try: text_unicode.encode('gbk') return text_unicode except UnicodeEncodeError as e: illegal_ch = e.object[e.start:e.end] illegal_set = cls._illegal_char_set illegal_set.add(illegal_ch) # try to replace directly for ch in illegal_set: text_unicode = text_unicode.replace(ch, '') # remove recursively return cls.remove_illegal_gbk_char(text_unicode) @classmethod def remove_emoji_char(cls, text_unicode): res = cls._emoji_pattern_cfg.sub('', text_unicode) return res @classmethod def conv_fenc_u8_to_gbk(cls, in_fpath, out_fpath): try: with codecs.open(in_fpath, 'r', 'utf-8') as rfd, \ codecs.open(out_fpath, 'w', 'gbk') as wfd: # read utf8, write gbk for line in rfd: line = cls.remove_illegal_gbk_char(line) wfd.write(line) except Exception as e: logger.get().warn('errmsg=%s' % (e)) @classmethod def tokenize_via_jieba(cls, text, filter_stop_word=True, norm_flag=True): tokens = jieba.lcut(text.lower()) if filter_stop_word: stop_words = config.g_stop_words_cfg tokens = filter(lambda x: x not in stop_words, tokens) if norm_flag: norm_func = cls._normalize_token return map(norm_func, tokens) else: return tokens else: return tokens @classmethod def stat_token_freq(cls, in_fpath, out_fpath): stop_words = config.g_stop_words_cfg try: word_counter = Counter() with codecs.open(in_fpath, 'r', 'utf-8') as rfd: for line in rfd: raw_str, word_seg = line.strip('\n').split('\t') tokens = word_seg.split() tokens = filter(lambda x: x not in stop_words, tokens) tokens = map(cls._normalize_token, tokens) for t in tokens: if ('{[' not in t) and len(t) <= cls._valid_token_len: word_counter[t] += 1 else: logger.get().warn('invalid token, token=%s' % (t)) # tokenize via jieba for n_t in jieba.cut(t): word_counter[n_t] += 1 logger.get().debug('jieba cut, token=%s' % (n_t)) # dump word_counter sorted_words = sorted(word_counter.keys(), key=lambda k: word_counter[k], reverse=True) with codecs.open(out_fpath, 'w', 'utf-8') as wfd: for word in sorted_words: tmp = '%s\t%s\n' % (word, word_counter[word]) wfd.write(tmp) except Exception as e: logger.get().warn('errmsg=%s' % (e)) @classmethod def _normalize_token(cls, token): token = token.lower() try: # 11 usually means phone number if len(token) != 11 and token.isdigit(): token = 'int_t' for k, v in cls._replace_pattern_cfg.items(): if v.match(token): token = k break if '{[' not in token: return token for item in cls._wordseg_pattern_cfg: token = item.sub('', token) return token except Exception as e: logger.get().warn('token=%s, errmsg=%s' % (token, e)) return token
def addCommentTable(mongodbIP, mysqlhostIP, mysqlUserName='******', mysqlPassword='******', dbname='cctvTimer'): # 读停用词 path = os.path.abspath(os.path.dirname(sys.argv[0])) dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r') stopwords = dicFile.readlines() stopwordList = [] stopwordList.append(' ') for stopword in stopwords: temp = stopword.strip().replace('\r\n', '').decode('utf8') stopwordList.append(temp) dicFile.close() # 分词 jieba.initialize() # 连接数据库 sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8') sqlcursor = sqlConn.cursor() # 删库 # sqlcursor.execute('''DROP TABLE IF EXISTs commentTable;''') # print '删库成功' sqlcursor.execute( '''CREATE TABLE IF NOT EXISTS commentTable(countIndex bigint(64) primary key, commentId bigint(64), weiboId bigint(64), userId bigint(64), comment varchar(1024), sentimentKeywords varchar(128), contentKeywords varchar(1024), sentiment varchar(16), sentimentScore int(16), userName varchar(64), userSex varchar(16), userLocation varchar(64), userFollowerCount int(64), userFriendCount int(64), userStatusCount int(64), userType varchar(32), spammerJudge varchar(16), replyTime varchar(128)) DEFAULT CHARSET=utf8;''' ) print '新建库成功' # 连接mongoDB数据库 mongoConn = pymongo.Connection(host=mongodbIP, port=27017) # check time mongoCursor = mongoConn.weibo.timestamp.find({'type': 'comment'}).limit(1) timeRangeBeginning = datetime.datetime.now() - datetime.timedelta( days=9999) # print timeRangeBeginning # a=dict() # a['type']='comment' # a['time']=timeRangeBeginning # mongoConn.weibo.timestamp.insert(a) # a=dict() # a['type']='repost' # a['time']=timeRangeBeginning # mongoConn.weibo.timestamp.insert(a) # a=dict() # a['type']='weibo' # a['time']=timeRangeBeginning # mongoConn.weibo.timestamp.insert(a) for i in mongoCursor: timeRangeBeginning = i['time'] newTimestamp = timeRangeBeginning # 查询某条微博的回复 mongoCursor = mongoConn.weibo.comment.find({ 'task_time': { '$gt': timeRangeBeginning } }).sort('task_time').batch_size(30) print '查询mongoDB成功' # 计数 sqlcursor.execute('select count(*) from commentTable;') totalCount = sqlcursor.fetchall() totalCount = list(list(totalCount)[0])[0] # 存储评论数据 commentsData = [] tempData = [] # 处理情感 emProcess = emotionProcess() rmIrr = removeIrrelevant() spamDet = spammerdetect() emotionsWord = [] emotionsScore = 0 count = 0 printCount = totalCount # 处理每一条 # try: for comment in mongoCursor: # if comment['task_time']>timeRangeBeginning+ datetime.timedelta(days=1): # continue count += 1 printCount += 1 tempData.append(printCount) # 评论id tempData.append(comment['comment_id']) # 微博id tempData.append(comment['weibo_id']) # 用户id tempData.append(comment['comment_user_id']) # 评论内容 tempData.append(comment['comment_text']) # 情感关键词 (emotionsWord, emotionsScore) = emProcess.processSentence( rmIrr.removeEverythingButEmotion(comment['comment_text'])) emotionsWord = ','.join(emotionsWord) tempData.append(emotionsWord) # print comment['mid'] # print comment['status']['mid'] # print comment['user']['id'] # print comment['text'] # 内容分词 tempcut_out = jieba.cut(rmIrr.removeEverything( comment['comment_text'])) cut_out = [] for i in tempcut_out: if i not in stopwordList: cut_out.append(i) tempData.append(','.join(cut_out)) # 倾向性判断 if emotionsScore > 0: tempData.append('正面') elif emotionsScore == 0: tempData.append('中立') else: tempData.append('负面') # sentimentScore tempData.append(emotionsScore) # 用户昵称 tempData.append(comment['comment_user_name']) # 用户性别 tempData.append(comment['comment_gender']) # 
用户地域信息 tempData.append(comment['comment_location']) # 用户粉丝数 tempData.append(comment['comment_followers_count']) # 用户关注数 tempData.append(comment['comment_friends_count']) # 用户微博数 tempData.append(comment['comment_statuses_count']) # 用户类型 if (comment['comment_verified_type'] == -1): tempData.append('普通用户') elif (comment['comment_verified_type'] == 220) or (comment['comment_verified_type'] == 200): tempData.append('微博达人') elif (comment['comment_verified_type'] == 0): tempData.append('个人认证') else: tempData.append('企业认证') # 是否水军 userInfo = {} userInfo['statuses_count'] = comment['comment_statuses_count'] userInfo['followers_count'] = comment['comment_followers_count'] userInfo['friends_count'] = comment['comment_friends_count'] userInfo['bi_followers_count'] = comment['comment_bi_followers_count'] userInfo['domain'] = comment['comment_user_domain'] userInfo['url'] = comment['comment_url'] userInfo['description'] = comment['comment_description'] userInfo['location'] = comment['comment_location'] userInfo['verified'] = comment['comment_verified'] userInfo['verified_type'] = comment['comment_verified_type'] newTimestamp = comment['task_time'] spamScore = spamDet.detectSpammer(userInfo) if spamScore > 0: tempData.append("正常") else: tempData.append("水军") # 回复时间 hh = time.strptime(str(comment['comment_created_at']), '%Y-%m-%d %H:%M:%S') commentTime = time.strftime("%a %b %d %H:%M:%S %Y", hh) tempData.append(commentTime) # 转换为元组 commentsData.append(tuple(tempData)) tempData = [] if count >= 10: sqlcursor.executemany( '''insert into commentTable(countIndex, commentId, weiboId, userId, comment, sentimentKeywords, contentKeywords, sentiment, sentimentScore, userName, userSex,userLocation, userFollowerCount, userFriendCount, userStatusCount, userType,spammerJudge, replyTime) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', commentsData) sqlConn.commit() commentsData = [] count = 0 print '插入' + str(printCount) + '个' # # except: # # print tempData sqlcursor.executemany( '''insert into commentTable(countIndex, commentId, weiboId, userId, comment, sentimentKeywords, contentKeywords, sentiment, sentimentScore, userName, userSex, userLocation, userFollowerCount, userFriendCount, userStatusCount, userType,spammerJudge, replyTime) values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)''', commentsData) sqlConn.commit() sqlConn.close() mongoConn.weibo.timestamp.update({'type': 'comment'}, {'$set': { 'time': newTimestamp }}) mongoConn.close()
#coding=utf-8
'''
Created on 2014-2-22

@author: yuzhang
'''
import jieba.posseg as jbp
import jieba as jb
import time

jb.enable_parallel()
jb.initialize()

text = '''
'''

start = time.clock()
for i in range(1000000):
    jb.cut(text)
print time.clock() - start

start = time.clock()
for i in range(1000000):
    jbp.cut(text)
print time.clock() - start
def test(): if FLAGS.src_word_seg == 'word': import jieba jieba.initialize() sess = tf.Session() src_vocab_dict, _ = data_utils.read_map(FLAGS.source_data + '.' + str(FLAGS.src_vocab_size) + '.mapping') _ , trg_vocab_list = data_utils.read_map(FLAGS.target_data + '.' + str(FLAGS.trg_vocab_size) + '.mapping') model = create_seq2seq(sess, 'TEST') model.batch_size = 1 sys.stdout.write("Input sentence: ") sys.stdout.flush() sentence = sys.stdin.readline() if FLAGS.src_word_seg == 'word': sentence = (' ').join(jieba.lcut(sentence)) print('sentence: ',sentence) elif FLAGS.src_word_seg == 'char': sentence = (' ').join([s for s in sentence]) while(sentence): token_ids = data_utils.convert_to_token(tf.compat.as_bytes(sentence), src_vocab_dict, False) bucket_id = len(buckets) - 1 for i, bucket in enumerate(buckets): if bucket[0] >= len(token_ids): bucket_id = i break # Get a 1-element batch to feed the sentence to the model. encoder_input, decoder_input, weight = model.get_batch({bucket_id: [(token_ids, [], "", "")]}, bucket_id) # Get output logits for the sentence. output = model.run(sess, encoder_input, decoder_input, weight, bucket_id) # This is a greedy decoder - outputs are just argmaxes of output_logits. # beam search all if bool(model.beam_search) is True: if bool(FLAGS.debug): outs = [] for _ in range(model.beam_size): outs.append([]) for out in output: for i,o in enumerate(out): outs[i].append(o) outs = np.array(outs) #print('outs: ',outs.shape) outputss = [] for out in outs: #print('out: ',out.shape) outputs = [int(np.argmax(logit)) for logit in out] outputss.append(outputs) for i,outputs in enumerate(outputss): sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs]) sys_reply = data_utils.sub_words(sys_reply) sys_reply = qulify_sentence(sys_reply) if i == 0: print(colored("Syetem reply(bs best): " + sys_reply,"red")) else: print("Syetem reply(bs all): " + sys_reply) else: output = model.run(sess, encoder_input, decoder_input, weight, bucket_id) outputs = [int(np.argmax(logit, axis=1)) for logit in output] if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs]) sys_reply = data_utils.sub_words(sys_reply) sys_reply = qulify_sentence(sys_reply) print("Syetem reply(bs best): " + sys_reply) # MLE else: output = model.run(sess, encoder_input, decoder_input, weight, bucket_id) print(output) print('output: ', len(output), output.shape, output[0].shape) outputs = [int(np.argmax(logit, axis=1)) for logit in output] # If there is an EOS symbol in outputs, cut them at that point. if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] sys_reply = "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs]) sys_reply = data_utils.sub_words(sys_reply) sys_reply = qulify_sentence(sys_reply) print("Syetem reply(MLE): " + sys_reply) # Print out French sentence corresponding to outputs. #print("Syetem reply: " + "".join([tf.compat.as_str(trg_vocab_list[output]) for output in outputs])) print ("User input : ") sys.stdout.flush() sentence = sys.stdin.readline() if FLAGS.src_word_seg == 'word': sentence = (' ').join(jieba.lcut(sentence)) print ('sentence: ', sentence) elif FLAGS.src_word_seg == 'char': sentence = (' ').join([s for s in sentence])
def main():
    jieba.initialize("/home/Ming-Yi/MingYi/Behavior/Behavior/dict/dict.txt.big")
    jieba.load_userdict("/home/Ming-Yi/MingYi/Behavior/Behavior/dict/NameDict_Ch_v2")
    Read_Json_Data("rawdata/" + creat_dir_paht)
def main(args): # varibales input_text_folder = join('..', 'input_ASR_results') conn = MongoClient('localhost', 27017) db = conn.googlecrawlstream timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S') print('time stamp:', timestamp) matchfile_pre = 'fcr23.ws.re.wav.all2' matchfile_tmp = 'fcr23.ws.re.wav.all2.res' matchfile_result = 'fcr23.ws.re.wav.all2.match' #load config if args.load_config: with open('config', 'r', encoding='utf8') as f: config = json.loads(f.readlines()[0].strip()) outputpath = config['outputpath'] Nasgoogle_crawl_dir = config['Nasgoogle_crawl_dir'] ASR_result = config['ASR_result'] bashfilepath = config['bashfilepath'] input_text_folder = config['input_text_folder'] finishpath = config['finishpath'] else: outputpath = args.output_path Nasgoogle_crawl_dir = args.google_crawl_dir ASR_result = args.pinyin_access_ASR_result_path bashfilepath = args.bashfilepath input_text_folder = args.python_access_input_folder finishpath = args.web_data_path #os.makedirs(outputpath, exist_ok=True) #os.makedirs(finishpath, exist_ok=True) searchEngine = args.search_engine # firebaseurl = config['firebaseurl'] # fb = firebase.FirebaseApplication(firebaseurl,None) jieba.set_dictionary('dict.txt.big') jieba.initialize() # load from input text path input_text_path = [ join(input_text_folder, os.path.basename(x)) for x in glob.glob(join(input_text_folder, ('*'))) if '.cm' in x and '.cm2' not in x and '.syl' not in x ] #print(input_text_path) input_text_path = sorted(input_text_path, key=functools.cmp_to_key(myCompare)) search_enging = search() #searchEngine = 'Google' for eachTarget in [ reconstruct_search_words(eachpath, 0.845) for eachpath in input_text_path ]: for filename, keywordlist in eachTarget.items(): crawlflow = {} # get web urls from google each 15 seconds logger.info('Start: ' + filename) n_segment_urls = {} # is there a repetition in urls alldata = [] if not args.special_file_name == '': if not filename == args.special_file_name: continue print(filename, keywordlist) thisTurnData = [] crawlflow['keywordlist'] = keywordlist for keyword in keywordlist: crawlflow['filename'] = filename crawlflow['keyword'] = keyword [ os.remove(filename) for filename in glob.glob( join(outputpath, ('fcr23.ws.re.wav*'))) ] [ os.remove(filename) for filename in glob.glob(join(outputpath, ('*.txt'))) ] [ os.remove(filename) for filename in glob.glob(join(outputpath, ('*.line'))) ] tFirstStart = time.time() if searchEngine == 'Google': webUrls = search_enging.google_get_url( keyword) # crawl google elif searchEngine == 'Bing': webUrls = search_enging.bing_get_url( keyword) # crawl google crawlflow['searchEngine'] = searchEngine crawlflow['webUrls'] = webUrls crawlflow['round'] = keywordlist.index(keyword) for url in webUrls: if url in n_segment_urls: n_segment_urls[url] += 1 webUrls.remove(url) else: n_segment_urls[url] = 1 #pool = mp.Pool() #thisTurnData = pool.map(crawlpage,webUrls) thisTurnData = Parallel(n_jobs=-1, backend="threading")( delayed(crawlpage)(url) for url in webUrls) alldata.extend(thisTurnData) #pool.close() #pool.join() crawlPagetime = str(int(time.time() - tFirstStart)) crawlflow['crawlPagetime'] = crawlPagetime thisTurnData = [ data for data in thisTurnData if len(''.join(data)) < 30000 and not data == '' and not data == [] ] after_filter_page_num = len(thisTurnData) crawlflow['afterFilterPageNum'] = after_filter_page_num if not thisTurnData: crawlflow['filename'] = filename + '-' + str( keywordlist.index(keyword)) db[timestamp + 'fail'].insert_one(crawlflow.copy()) 
#fb.post('/'+timestamp+'fail', crawlflow) # crawlflow = {} # crawlflow['filename'] = filename # crawlflow['keywordlist'] = keywordlist continue # write down those data from web page for data in thisTurnData: webcontent = ''.join(data) if len(webcontent) > 0: with open(join( outputpath, filename + '-' + str(alldata.index(data)) + '.txt'), 'w', encoding='utf8') as f: f.write(''.join(data)) # use pin yin to transfer data tStart = time.time() rq.get( bashfilepath + '?text={}&asr={}'.format(Nasgoogle_crawl_dir, ASR_result)) tranfPinYintime = str(int(time.time() - tStart)) crawlflow['tranfPinYintime'] = tranfPinYintime tStart = time.time() # use match method to find paragraph p1 = subprocess.Popen([ 'python3', 'generate_diff.py', join(outputpath, matchfile_pre), join(outputpath, matchfile_tmp) ], cwd="Match/wav_matched/", stdout=subprocess.PIPE, shell=False) p1.wait() p2 = subprocess.Popen([ 'python3', 'filter_crawl_result.py', join(outputpath, matchfile_tmp), join(outputpath, matchfile_result) ], cwd="Match/wav_matched/", stdout=subprocess.PIPE, shell=False) p2.wait() matchFunctiontime = str(int(time.time() - tStart)) crawlflow['matchFunctiontime'] = matchFunctiontime # Analyze - read match file and decide to query this file or not if analyze(filename, outputpath, finishpath, 0.9, thisTurnData, input_text_path, crawlflow)[0] == 'Get paragraph': # from crawlflow['oriASRresult'] to compare with crawlflow['paragraph'] crawl_compare_match = SequenceMatcher( None, crawlflow['oriASRresult'], crawlflow['paragraph']).get_matching_blocks() same_sents = [ crawlflow['oriASRresult'][m[0]:m[0] + m[2]] for m in crawl_compare_match ] same_sents = [ sentence for sentence in same_sents if len(sentence) > 1 ] crawlflow['oriASRresult'] = crawlflow[ 'oriASRresult'].replace(' ', '') crawlflow['paragraph'] = crawlflow['paragraph'].replace( ' ', '') opc1 = SequenceMatcher( None, crawlflow['oriASRresult'], crawlflow['paragraph']).get_opcodes() hint_dict = {} for tag, i1, i2, j1, j2 in opc1: if tag == 'replace': hint_dict[(j1, j2)] = crawlflow['paragraph'][j1:j2] jiebacut_result = [ w for w in jieba.cut(crawlflow['paragraph']) ] orihints = [ crawlflow['paragraph'][j1:j2] for tag, i1, i2, j1, j2 in opc1 if tag == 'replace' ] # hints.extend(same_sents) crawlflow['orihints'] = orihints reconstruct_hints = diff_word_reconstruct( hint_dict, jiebacut_result, crawlflow['paragraph']) hints = reconstruct_hints.copy() crawlflow['reconstruct_hints'] = reconstruct_hints tmpparagraph = crawlflow['paragraph'] for hint in hints: tmpparagraph = tmpparagraph.replace(hint, ' ') hints.extend([ sent for sent in ''.join(tmpparagraph).split(' ') if len(sent) > 1 ]) hintlength = 0 tmphint = [] for hint in hints: hintlength += len(hint) if hintlength >= 5000: break else: if len(hint) < 100: tmphint.append(hint) crawlflow['hints'] = tmphint db[timestamp].insert_one(crawlflow.copy()) #fb.post('/'+timestamp, crawlflow) break else: thisTurnData = [] crawlflow['filename'] = filename + '-' + str( keywordlist.index(keyword)) db[timestamp + 'fail'].insert_one(crawlflow.copy()) #fb.post('/'+timestamp+'fail', crawlflow) # crawlflow.clear() # crawlflow['filename'] = filename # crawlflow['keywordlist'] = keywordlist #x = input('wait here') tEnd = time.time() sleeptime = 15 - int(tEnd - tFirstStart) if sleeptime > 0 and searchEngine == 'Google': print('sleep', sleeptime) time.sleep(sleeptime)
class ProcessHandler(tornado.web.RequestHandler): # Global instance to store todos. You should use a database in reality. jieba.initialize() #mysqlHandler = MysqlHandler('localhost', 'root', 'Aqaz123!', 't5_rent') mysqlHandler = MysqlHandler(MYSQL_HOST, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DATABASE) detail_json = "" with open('/root/product/t5_rent/room_data') as fd: txt = fd.read().replace('\n','') detail_json = txt.decode('gbk').encode('utf-8') key = RSA.generate(2048) priv_pem = key.exportKey() pub_pem = key.publickey().exportKey() priv_key = RSA.importKey(priv_pem) pub_key = RSA.importKey(pub_pem) def send_res(self, result): final_result = '{"retcode":-1,"result":[]}'; if (result != NO_RESULT): final_result = result self.write(final_result) def check_user(self, claims): return True def get(self): # return all todos # Just dump data to json, and return it operation = self.request.uri ''' try: token = self.get_argument('token') except: payload = { 'site': 'zhiliaohou.online', 'name': 'litong'} token = jwt.generate_jwt(payload, self.priv_key, 'RS256', datetime.timedelta(minutes=5)) self.send_res('{"token":"%s"}' % token) return if (token == None or token == ""): self.send_res(NO_RESULT) return else: try: header, claims = jwt.verify_jwt(token, self.pub_key, ['RS256']) except: self.send_res(NO_RESULT) return if (self.check_user(claims) is False): self.send_res(NO_RESULT) return ''' t1 = int(time.time()*1000) if operation.find('house_list') != -1: result_list = [] try: queryString = self.get_argument('query') print queryString seg_list = jieba.cut_for_search(unquote(queryString), HMM=False) print seg_list except: seg_list = None print seg_list doclist = self.mysqlHandler.GetDocIdList(seg_list) ret_dic = [] for doc in doclist: print doc docs = self.mysqlHandler.GetContent(doclist) for doc in docs: (id, title, subdistrict, faceto, floor, year, dinner_num, room_num, fitment, area, pic) = doc house = House(id, title, subdistrict, faceto, floor, year, dinner_num, room_num, fitment, area, "", "", pic) house_dic = house.to_dict() result_list.append(house_dic) final_result = {'retcode':len(result_list), 'result': result_list} json_result = simplejson.dumps(final_result) #print json_result #send_str = str(simplejson.loads(json_result)).decode('utf8').encode('raw_unicode_escape') #print send_str self.send_res(json_result) t2 = int(time.time()*1000) print "[%s] cost=%d,ret=%d" % ((datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), t2 - t1, len(result_list)) return elif operation.find('house_detail') != -1: self.send_res(self.detail_json) t2 = int(time.time()*1000) print "[%s] cost=%d,ret=%d" % ((datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')), t2 - t1, 0) return
def __init__(self):
    jieba.initialize()
    jieba.enable_parallel(8)
def jieba_initialize():
    jieba.load_userdict(
        os.path.dirname(os.path.split(os.path.realpath(__file__))[0]) + '/resources/QAattrdic.txt')
    jieba.initialize()
def __init__(self):
    jieba.initialize()
    self.ltpseg = pyltp.Segmentor()
    self.ltpseg.load('model/ltp_data_v3.4.0/cws.model')
    self.thu1 = thulac.thulac(seg_only=True)
    pynlpir.open()
def __init__(self, userdict):
    preNormalSeg.__init__(self)
    jb.load_userdict(userdict)
    jb.initialize()
def __init__(self) -> None:
    import jieba
    import jieba.posseg as pseg
    self.__tokenize = pseg.cut
    jieba.initialize()
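The tokenizer stored above is jieba.posseg.cut, which yields pair objects carrying both the token and its part-of-speech flag. A minimal sketch of what consuming it looks like; the sentence is illustrative:

import jieba.posseg as pseg

for pair in pseg.cut(u"我爱北京天安门"):
    # Each item exposes .word (the token) and .flag (the POS tag, e.g. 'r', 'v', 'ns').
    print(pair.word, pair.flag)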
#http://www.oss.io/p/fxsjy/jieba
'''
1. Segmentation

jieba.cut takes three parameters: the string to segment; cut_all, which controls whether full mode is used; and HMM, which controls whether the HMM model is used.
jieba.cut_for_search takes two parameters: the string to segment and whether to use the HMM model. It is suited to building inverted indexes for search engines and uses a finer granularity.
The string to segment can be unicode, UTF-8 or GBK. Note: passing GBK strings directly is not recommended, since they may be wrongly decoded as UTF-8.
jieba.cut and jieba.cut_for_search return an iterable generator; use a for loop to get each token (unicode), or use jieba.lcut and jieba.lcut_for_search to get a list directly.
jieba.Tokenizer(dictionary=DEFAULT_DICT) creates a custom tokenizer, which makes it possible to use several dictionaries at the same time. jieba.dt is the default tokenizer; all global segmentation functions are mapped onto it.
'''
import jieba
import time

jieba.initialize()  # manual initialization
time.sleep(1)

seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
print("Full Mode: " + "/ ".join(seg_list))  # full mode

seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
print("Default Mode: " + "/ ".join(seg_list))  # exact mode

seg_list = jieba.cut("他来到了网易杭研大厦")  # exact mode by default
print(", ".join(seg_list))

seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
print(", ".join(seg_list))

'''
2. Adding a custom dictionary
def __init__(self, **kwargs) -> None:
    jieba.initialize()
    pass
def __init__(self, general_thesaurus_path):
    """Read the general thesaurus"""
    jieba.initialize(general_thesaurus_path)
for tk in result:
    print("%s \t start at: %d \t end at: %d" % (tk[0], tk[1], tk[2]))


# In[23]:

# ### Search mode
# Scan out every token that can form a word in the sentence and report its position.
result = jieba.tokenize(u"永和服装饰品有限公司", mode="search")
for tk in result:
    print("%s \t start at: %d \t end at: %d" % (tk[0], tk[1], tk[2]))


# In[ ]:

# ## Part 7 Lazy loading mechanism
# * jieba uses lazy loading: import jieba and jieba.Tokenizer() do not trigger loading of the
#   dictionary immediately; the dictionary is loaded and the prefix dictionary built only once
#   it is needed. If you want to initialize jieba manually, you can do so explicitly.

# In[24]:

import jieba
jieba.initialize()  # manual initialization, optional


# In[25]:

# Versions before 0.28 could not specify the path of the main dictionary; with the lazy-load
# mechanism you can now change it:
# jieba.set_dictionary("data/dict.txt.big")
# You can also download the dictionary you need and overwrite jieba/dict.txt with it.


# In[ ]:
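The lazy-load note above also mentions jieba.Tokenizer, which keeps its own dictionary state and is the way to work with several dictionaries side by side. A minimal sketch of an independent tokenizer instance; the alternative dictionary path in the comment is an assumption, omit it to use the default dictionary:

import jieba

# An independent tokenizer with its own dictionary; jieba.dt is the global default one.
tk = jieba.Tokenizer()          # or jieba.Tokenizer(dictionary='data/dict.txt.big')
tk.initialize()
tk.add_word(u"杭研大厦")        # affects only this tokenizer, not jieba.dt
print("/".join(tk.cut(u"他来到了网易杭研大厦")))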
def jieba_initialize():
    jieba.load_userdict(os.path.dirname(os.path.split(os.path.realpath(__file__))[0]) + '/resources/QAattrdic.txt')
    jieba.initialize()
import codecs
import re
import jieba
#import jieba.posseg
import jieba.analyse
import logging
import os
from functions import *
from hanziconv import HanziConv
import argparse

"""
This is a file to pre-process the data
"""

logging.basicConfig()  # level=logging.NOTSET
jieba.initialize()  # (optional)

parser = argparse.ArgumentParser(
    description='preprocess files to remove unrelated flags')
parser.add_argument('--data_path', type=str, help='the origin data path')
parser.add_argument('--aim_path', type=str, help='the path of processed data')
parser.add_argument(
    '--process_answer',
    type=lambda s: s.lower() in ['true', 't', 'yes', '1'],
    default=False,
    help='a switch to process the answer; set true when processing training data')

args = parser.parse_args()

fp = codecs.open(args.data_path, "r", "utf-8")
#!/usr/bin/env python
#encoding:utf8
import jieba
import jieba.posseg as pseg
jieba.initialize(dictionary="dict.txt")

from pyspark import SparkContext


def tokenize(text):
    docid, body = text.split('\t', 1)
    items = []
    for word, flag in pseg.cut(body):
        items.append('%s/%s' % (word, flag))
    result = "%s\t%s" % (docid, ' '.join(items))
    return result


if __name__ == "__main__":
    sc = SparkContext(appName="Python Tokens")
    #input_file = 'liuxufeng/nlp/doc_text/part-00040'
    input_file = 'liuxufeng/nlp/doc_text/*'
    bodies = sc.textFile(input_file)
    items = bodies.map(tokenize)  # .collect()
    #for item in items:
    #    print item.encode("utf8")
    items.saveAsTextFile("liuxufeng/nlp/doc_tokens")
    sc.stop()
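In a Spark job like the one above, each executor pays the jieba dictionary load the first time tokenize runs on it. Using mapPartitions makes that cost explicit by initializing once per partition instead of relying on the implicit lazy load per record; a hedged sketch of that variant, assuming the same tab-separated input layout as the example:

def tokenize_partition(lines):
    # Runs once per partition: pay the dictionary load here, then stream the records.
    import jieba
    import jieba.posseg as pseg
    jieba.initialize()
    for text in lines:
        docid, body = text.split('\t', 1)
        items = ['%s/%s' % (word, flag) for word, flag in pseg.cut(body)]
        yield "%s\t%s" % (docid, ' '.join(items))

# items = bodies.mapPartitions(tokenize_partition)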
                    default=True,
                    help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)

delim = str(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

if args.dict:
    jieba.initialize(args.dict)
else:
    jieba.initialize()
if args.user_dict:
    jieba.load_userdict(args.user_dict)

ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)))
    ln = fp.readline()

fp.close()
# coding=utf8
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import re
import json
import codecs
import jieba
jieba.initialize()  # manually initialize jieba
# import jieba.analyse
import jieba.posseg as pseg
# import redis
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os
from gensim import corpora, models, similarities
from flask import Flask, request, abort, g, current_app
# from werkzeug.contrib.fixers import ProxyFix

app = Flask(__name__)
project_path = './'
docpath = '/home/workspace/news'


# @app.before_first_request
# @app.before_request
def appd():
    app.config['stopwords'] = codecs.open(project_path + 'stopwords.txt', encoding='UTF-8').read()
    app.config['dictionary'] = corpora.Dictionary.load(project_path + 'lsi/' + 'viva.dict')
def fenci_initalize():
    jieba.initialize()
from jieba import lcut as jc
from jieba import initialize
from pkuseg import pkuseg
from thulac import thulac
from telegram import InputTextMessageContent, InlineQueryResultArticle
from telegram.ext import (
    Updater,
    CommandHandler,
    CallbackContext,
    InlineQueryHandler,
)
from telegram.update import Update

from config import BOT_TOKEN

initialize()
tc = thulac(seg_only=True).cut  # pylint: disable=C0103
pc = pkuseg().cut  # pylint: disable=C0103


def words(update: Update, context: CallbackContext) -> None:
    """Words the inline message."""
    query = update.inline_query.query
    resj = " ".join(jc(query, cut_all=False))
    rest = tc(query, text=True)
    resp = " ".join(pc(query))
    print("--" * 10)
    print(query, end="\n")
#coding=utf-8
'''
Dictionary-based emotion classification
'''
__author__ = 'Eric_Chan'
import re
import jieba
import chardet
import time

jieba.initialize()  # start the jieba module manually
print '结巴系统启动完毕'

mood_dist = {0: '厌恶', 1: '同情', 2: '喜欢', 3: '怨恨', 4: '悲伤', 5: '愉快', 6: '愤怒', 7: '焦虑', 8: '其他'}


def load_word_data(filename):  # load the emotion term dictionaries
    file1 = open('/Users/Har/Desktop/DM/舆情/学习/基于规则的情绪划分/emotion_words/情绪词/%s' % filename, 'r')
    line = file1.readline().strip()
    words = []
    while line:
        charset = chardet.detect(line)  # detect the file encoding
        code = charset['encoding']
        # print code
        line = line.decode(code, 'ignore')
        words.append(line)
        line = file1.readline().strip()
    file1.close()
    file2 = open('/Users/Har/Desktop/DM/舆情/学习/基于规则的情绪划分/emotion_words/网络新词/%s' % filename, 'r')
    line = file2.readline().strip()
def __init__(self, ranker):
    jieba.initialize()
    jieba.enable_parallel(8)
    self.ranker = ranker
def addcustomerEvaluation_informal(hbaseIP, mysqlhostIP, mysqlUserName='******', mysqlPassword='', dbname='btv'): # 读停用词 path = os.path.abspath(os.path.dirname(sys.argv[0])) dicFile = open(path + '/tools/NTUSD_simplified/stopwords.txt', 'r') stopwords = dicFile.readlines() stopwordList = [] stopwordList.append(' ') for stopword in stopwords: temp = stopword.strip().replace('\r\n', '').decode('utf8') stopwordList.append(temp) dicFile.close() # 分词 jieba.initialize() source_1 = 'weibo' # 连接数据库 sqlConn = MySQLdb.connect(host=mysqlhostIP, user=mysqlUserName, passwd=mysqlPassword, db=dbname, charset='utf8') sqlcursor = sqlConn.cursor() sqlcursor.execute( '''CREATE TABLE IF NOT EXISTS media_evaluation(pk bigint NOT NULL PRIMARY KEY AUTO_INCREMENT, flag int(1), evaluation bigint(20), content varchar(200), date Date, program_id varchar(200), program varchar(200)) DEFAULT CHARSET=utf8;''' ) print '新建库成功' # 时间属性 # inter为0,即为当日 # 库中是2.29 inter = 37 now = int(time.time()) - 86400 * inter timeArray = time.localtime(now) otherStyleTime = time.strftime("%Y-%m-%d", timeArray) print otherStyleTime # 连接hbase数据库 conn = happybase.Connection(hbaseIP) conn.open() # 存储评论数据 # 处理情感 emProcess = emotionProcess() rmIrr = removeIrrelevant() # 首先获取栏目,注意栏目及相关hbase存储信息有更新,请在hbase_info做同步 # print 'SELECT DISTINCT(program) from hbase_info where source = %s' %source_1 sqlcursor.execute( "SELECT DISTINCT(program) from hbase_info where source = 'buzz';") bufferTemp = sqlcursor.fetchall() # print len(bufferTemp) for one_program in bufferTemp: commentsData = [] tempData = [] one_program = one_program[0].encode('utf8') print type(one_program), one_program sqlcursor.execute( '''SELECT hbase_table from hbase_info where program = %s and source = %s;''', (one_program, source_1)) bufferTemp = sqlcursor.fetchone() program_hbase_table = bufferTemp[0] print program_hbase_table # 以“JQJM”为关键词的微博原贴代替,按理应该是底下的评论 table = conn.table(str(program_hbase_table)) # customerEvaluation_informal需要有program标识 emotionsWord = [] emotionsScore = 0 count = 0 printCount = 0 sqlcursor.execute( '''SELECT program_id from competition_analysis where program = %s''', (one_program, )) bufferTemp = sqlcursor.fetchone() program_id = bufferTemp[0] print program_id # row_prefix, limit可以限定次数 for key, data in table.scan(limit=10, batch_size=10): # print 'hhh',key,data # for key,data in table.scan(row_prefix = 'row', limit = 10, batch_size = 10): date_created = data['base_info:cdate'] if date_created == otherStyleTime: content = data['base_info:text'] print 'q', content # 暂时没有program_id # program_id = data['base_info:program_id'] count += 1 printCount += 1 # 处理每一条 # 情感关键词 (emotionsWord, emotionsScore) = emProcess.processSentence( rmIrr.removeEverythingButEmotion(content)) # 倾向性判断flag:1是正面,0是中性,-1是负面 # 情感极性判断,这里我限制了更严格的条件 if emotionsScore > 0: tempData.append('1') elif emotionsScore == 0: tempData.append('0') elif emotionsScore < 0: tempData.append('-1') # 情感得分sentimentScore tempData.append(emotionsScore) # 评论内容 tempData.append(content) # 日期时间 tempData.append(otherStyleTime) # 栏目id tempData.append(program_id) # 栏目名称 tempData.append(one_program) # 转换为元组 commentsData.append(tuple(tempData)) tempData = [] if count >= 10: sqlcursor.executemany( '''insert into media_evaluation(flag, evaluation, content, date, program_id, program) values (%s, %s, %s, %s, %s, %s)''', commentsData) sqlConn.commit() commentsData = [] count = 0 print '插入' + str(printCount) + '个' # # except: # # print tempData sqlcursor.executemany( '''insert into media_evaluation(flag, evaluation, content, date, 
program_id, program) values (%s, %s, %s, %s, %s, %s)''', commentsData) sqlConn.commit() sqlConn.close()
                    nargs='?', const=' ',
                    help="use DELIM instead of ' / ' for word delimiter; use a space if it is without DELIM")
parser.add_argument("-a", "--cut-all", action="store_true", dest="cutall",
                    default=False, help="full pattern cutting")
parser.add_argument("-n", "--no-hmm", dest="hmm", action="store_false",
                    default=True, help="don't use the Hidden Markov Model")
parser.add_argument("-q", "--quiet", action="store_true", default=False,
                    help="don't print loading messages to stderr")
parser.add_argument("-V", '--version', action='version',
                    version="Jieba " + jieba.__version__)
parser.add_argument("filename", nargs='?', help="input file")

args = parser.parse_args()

if args.quiet:
    jieba.setLogLevel(60)

delim = unicode(args.delimiter)
cutall = args.cutall
hmm = args.hmm
fp = open(args.filename, 'r') if args.filename else sys.stdin

jieba.initialize()
ln = fp.readline()
while ln:
    l = ln.rstrip('\r\n')
    print(delim.join(jieba.cut(ln.rstrip('\r\n'), cutall, hmm)).encode('utf-8'))
    ln = fp.readline()

fp.close()
def __init__(self):
    config = Config()
    #jieba.set_dictionary(config.zh_dict_txt_big)
    #jieba.load_userdict(config.zh_my_dict)
    jieba.initialize()
    self.model = models.Word2Vec.load(config.word2vec_model_zh)