def get_item(marc_no, status=0):
    """Scrape the detail page of one record into a dictionary.

    The page ``ourl + 'item.php?marc_no=<marc_no>'`` is downloaded, the
    ``<div id="book_info">`` section is cut out and every
    ``<dl class="booklist">`` entry inside it is turned into one
    ``key -> value`` pair.  The key is the ``<dt>`` label converted with
    ``hanzi2pinyin_split(..., firstcode=True)`` (slashes removed); the value
    is the text of the first link inside ``<dd>``, or the raw ``<dd>``
    content when it holds no link.

    Args:
        marc_no: record number appended to the request URL.
        status: when equal to 1, every label/value pair is also printed.

    Returns:
        dict mapping the converted labels to their values.

    Raises:
        IndexError: if the page has no ``book_info`` section (unchanged from
            the original behaviour).
    """
    converter = PinYin()
    converter.load_word('word.data')

    response = requests.get(ourl + 'item.php?marc_no=' + str(marc_no))
    page = response.text.encode(encoder).decode('utf8').replace(' ', '')
    page = HTMLParser.HTMLParser().unescape(page)

    info = re.findall(
        '<div id="book_info">(.*?)<div class="clear"></div>', page, re.S)[0]

    item = {}
    for entry in re.findall('<dl class="booklist">(.*?)</dl>', info, re.S):
        labels = re.findall('<dt>(.*?)</dt>', entry, re.S)
        values = re.findall('<dd>(.*?)</dd>', entry, re.S)
        # A single malformed or unlabeled entry must not abort the scrape.
        if not labels or not values or labels[0] == '':
            continue
        label, value = labels[0], values[0]

        # Prefer the link text; fall back to the raw cell when there is none.
        links = re.findall('>(.*?)</a>', value, re.S)
        if links:
            value = links[0]

        key = converter.hanzi2pinyin_split(
            string=label, split="", firstcode=True).replace('/', '')
        item[key] = value

        if status == 1:
            print("%s %s" % (label, value))
    return item
def main():
    """Load ``video.json`` and insert its records into the MySQL tables.

    For every record one ``resource`` row is written, then the record's
    ``tag1`` category is looked up (and created when missing) and linked via
    ``category_mapping``, and finally one ``screenshots`` row is written per
    entry of ``record['screen']``.

    NOTE(review): every statement is assembled with ``str.format`` from file
    content, so a quote inside a value breaks the statement.  Switching to
    parameterized queries needs the ``MySQLHander`` API, which is not visible
    from here.
    """
    resource_sql = "INSERT INTO resource VALUES (null, '', '{title}', '{desc}', '{thumb}', '{url}',{duration}, {vister}, {likes},{creat_time}, '{up_time}')"
    lookup_sql = "SELECT id from category WHERE cname='{0}'"
    category_sql = "INSERT INTO category values(null, '{ename}', '{cname}', {time})"
    mapping_sql = "INSERT INTO category_mapping values(null, {cid}, {rid}, {time})"
    screenshot_sql = "INSERT INTO screenshots values(null, {rid}, '{pic}', {time})"

    db = MySQLHander()
    converter = PinYin()
    converter.load_word()

    with open('video.json') as json_file:
        records = json.load(json_file)

    for record in records:
        resource_id = db.insert(resource_sql.format(
            title=record['name'].encode('utf-8', 'ignore'),
            desc=record['name'].encode('utf-8', 'ignore'),
            thumb=record['thumb'],
            url=record['url'],
            duration=random.randint(80, 120),
            vister=random.randint(4500, 9999),
            likes=random.randint(500, 2000),
            creat_time=time.time(),
            up_time=time.strftime('%Y-%m-%d %H:%M:%S'),
        ))

        # Only "tag1" is processed: the range yields the single value 1.
        for tag_no in range(1, 2):
            tag_name = record["tag{0}".format(tag_no)].encode('utf-8', 'ignore')
            db.query(lookup_sql.format(tag_name))
            row = db.fetchOneRow()
            if row:
                category_id = row[0]
            else:
                # Unknown category: create it with a pinyin slug first.
                slug = converter.hanzi2pinyin_split(
                    string=tag_name, split="-").replace('-', '')
                category_id = db.insert(category_sql.format(
                    ename=slug, cname=tag_name, time=int(time.time())))
            db.insert(mapping_sql.format(
                cid=category_id, rid=resource_id, time=int(time.time())))

        for picture in record['screen']:
            db.insert(screenshot_sql.format(
                rid=resource_id, pic=picture, time=int(time.time())))

        print("{0} done".format(record['thumb']))

    db.close()
def getciyun(self):
    """Generate the word clouds for the current topic.

    The topic (``self.aa.topic``) is converted to dash-separated pinyin and
    used as the prefix of each output name passed to ``cloud.ciyun1``.
    """
    converter = PinYin()
    converter.load_word()
    prefix = str(converter.hanzi2pinyin_split(string=str(self.aa.topic), split="-"))

    # (input text file, output name suffix), processed in this order:
    # answerer information, question information, answerer locations.
    jobs = (
        ('F:/zhihu/answer/people_qb.txt', 'people'),
        ('F:/zhihu/answer/question_top10.txt', 'question'),
        ('F:/zhihu/answer/p_location.txt', 'slocation'),
    )
    for text_path, suffix in jobs:
        cloud.ciyun1(text_path, prefix + suffix)
def draw_frame(faces, img, gray, move):
    """Annotate one frame with face boxes, recognized names and the FPS.

    Args:
        faces: iterable of ``(x, y, w, h)`` face rectangles.
        img: frame that is drawn on and displayed.
        gray: array the face crops are cut from (presumably the grayscale
            version of ``img`` - confirm against the caller).
        move: when equal to 2, ``steering_control(faces, img)`` runs first.

    Side effects:
        Increments the module-level ``fps`` counter, calls
        ``change_identity`` for every face whose prediction distance is
        below 500.0, and shows the frame in the "recognize-face" window.
    """
    global fps

    if move == 2:
        steering_control(faces, img)

    # Draw a rectangle around every face and label the ones we recognize.
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (200, 255, 0), 2)
        # Image arrays are indexed [row, column], i.e. [y, x].  The original
        # sliced [x:x + w, y:y + h], which crops a transposed region.
        roi = gray[y:y + h, x:x + w]
        try:
            roi = cv2.resize(roi, (200, 200), interpolation=cv2.INTER_LINEAR)
            params = model.predict(roi)
            if params[1] < 500.0:
                # Load the pinyin dictionary once and reuse it, instead of
                # re-reading it for every face of every frame.
                converter = getattr(draw_frame, '_pinyin', None)
                if converter is None:
                    converter = PinYin()
                    converter.load_word()
                    draw_frame._pinyin = converter

                pname = names[params[0]]
                change_identity(pname)
                label = ''.join(
                    converter.hanzi2pinyin_split(string=pname, split=''))
                sign = "%s %.2f" % (label, params[1])
                cv2.putText(img, sign, (x, y - 2),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
        except Exception:
            # Best effort: a face that cannot be processed is only boxed.
            continue

    # Calculate and show the FPS.
    fps = fps + 1
    sfps = fps / (time.time() - t_start)
    cv2.putText(img, "FPS : " + str(int(sfps)), (10, 15),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)
    cv2.imshow("recognize-face", img)
def mallSearch():
    """Search malls by the ``kw`` request parameter and record the search.

    Without a keyword the request is redirected to ``index``.  Otherwise the
    matching malls are fetched, the ``Keyword`` statistics row for the term is
    updated (or created), and ``mall/mallSearch.html`` is rendered with the
    malls and up to ten related keywords.
    """
    kw = MyRequest.getParams('kw')
    if not kw:
        return redirect(url_for('index'))

    kw = addslashes(kw)
    pattern = '%' + kw + '%'
    malls = Mall.query.filter(Mall.keyword.like(pattern)).all()
    hits = len(malls)

    keyword = Keyword.query.filter_by(keyword=kw).first()
    if keyword:
        if hits > keyword.items:
            keyword.items = hits
        # (timetodate format code, counter): the counter grows while the last
        # update and the current site time format identically, else restarts.
        # NOTE(review): codes 0 / 8 / 3 presumably mean month / week / day
        # granularity - confirm against timetodate.
        for code, counter in ((0, 'month_search'),
                              (8, 'week_search'),
                              (3, 'today_search')):
            if timetodate(keyword.updatetime, code) == timetodate(g.siteTime, code):
                setattr(keyword, counter, getattr(keyword, counter) + 1)
            else:
                setattr(keyword, counter, 1)
        keyword.total_search += 1
        keyword.updatetime = g.siteTime
    else:
        # First search for this term: start every counter at one.
        keyword = Keyword()
        keyword.keyword = kw
        keyword.items = hits
        keyword.updatetime = g.siteTime
        keyword.month_search = 1
        keyword.week_search = 1
        keyword.today_search = 1
        keyword.total_search = 1
        converter = PinYin(os.path.sep.join([g.rootpath, 'assets', 'word.data']))
        converter.load_word()
        keyword.letter = converter.hanzi2pinyin_split(string=kw, split=" ")

    db.session.add(keyword)
    db.session.commit()

    rewords = Keyword.query.filter(Keyword.keyword.like(pattern)).limit(10).all()
    return render_template('mall/mallSearch.html', malls=malls, g=g, kw=kw,
                           rewords=rewords)
class changetopinyin:
    """Build a pinyin -> candidate words table from a word/frequency file.

    Words are grouped by their underscore-separated pinyin.  Each group is a
    heap that holds at most six entries; once it is full the smallest entry
    is discarded on every insertion.
    """

    # Kept so that code reading the class attribute keeps working; every
    # instance replaces it with its own dictionary in __init__.
    wf_dict = {}

    def __init__(self):
        self.test = PinYin()
        self.test.load_word()
        # Per-instance table: a mutable class attribute would be shared (and
        # mixed up) between all converters.
        self.wf_dict = {}

    def change(self, filename):
        """Read ``filename`` and write the table to ``pinyin_dict2``.

        Each line is expected to hold ``<word> <frequency>`` separated by a
        single space.  Lines with fewer than two fields are skipped.
        """
        with open(filename, 'r') as ff:
            for line in ff:
                fields = line.split(' ')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to add.
                    continue
                self.addtodict(word_fre(fields[0], int(fields[1])))

        # Replace the entry objects with plain pairs so json can encode them.
        for bucket in self.wf_dict.values():
            for index, entry in enumerate(bucket):
                try:
                    bucket[index] = (entry.word, entry.fre)
                except AttributeError as e:
                    print(e)
        self.save('pinyin_dict2')

    def addtodict(self, wf):
        """Insert ``wf`` into the heap of words sharing its pinyin."""
        pp = self.test.hanzi2pinyin_split(wf.word, '_')
        bucket = self.wf_dict.setdefault(pp, [])
        if len(bucket) <= 5:
            heapq.heappush(bucket, wf)
        else:
            # Bucket is full: add the new entry and drop the smallest one.
            heapq.heappushpop(bucket, wf)

    def save(self, filename):
        """Write the table as JSON to ``filename``; errors are printed."""
        try:
            # Serialize first so a failure does not leave a truncated file.
            payload = json.dumps(self.wf_dict)
            with open(filename, 'w') as wff:
                wff.write(payload)
        except Exception as e:
            print(e)
class changetopinyin:
    """Build a pinyin -> candidate words table from a word/frequency file.

    Words are grouped by their underscore-separated pinyin.  Each group is a
    heap that holds at most six entries; once it is full the smallest entry
    is discarded on every insertion.
    """

    # Kept so that code reading the class attribute keeps working; every
    # instance replaces it with its own dictionary in __init__.
    wf_dict = {}

    def __init__(self):
        self.test = PinYin()
        self.test.load_word()
        # Per-instance table: a mutable class attribute would be shared (and
        # mixed up) between all converters.
        self.wf_dict = {}

    def change(self, filename):
        """Read ``filename`` and write the table to ``pinyin_dict2``.

        Each line is expected to hold ``<word> <frequency>`` separated by a
        single space.  Lines with fewer than two fields are skipped.
        """
        with open(filename, 'r') as ff:
            for line in ff:
                fields = line.split(' ')
                if len(fields) < 2:
                    # Blank or malformed line: nothing to add.
                    continue
                self.addtodict(word_fre(fields[0], int(fields[1])))

        # Replace the entry objects with plain pairs so json can encode them.
        for bucket in self.wf_dict.values():
            for index, entry in enumerate(bucket):
                try:
                    bucket[index] = (entry.word, entry.fre)
                except AttributeError as e:
                    print(e)
        self.save('pinyin_dict2')

    def addtodict(self, wf):
        """Insert ``wf`` into the heap of words sharing its pinyin."""
        pp = self.test.hanzi2pinyin_split(wf.word, '_')
        bucket = self.wf_dict.setdefault(pp, [])
        if len(bucket) <= 5:
            heapq.heappush(bucket, wf)
        else:
            # Bucket is full: add the new entry and drop the smallest one.
            heapq.heappushpop(bucket, wf)

    def save(self, filename):
        """Write the table as JSON to ``filename``; errors are printed."""
        try:
            # Serialize first so a failure does not leave a truncated file.
            payload = json.dumps(self.wf_dict)
            with open(filename, 'w') as wff:
                wff.write(payload)
        except Exception as e:
            print(e)
class correction:
    """Suggest replacements for a segmented phrase via a pinyin dictionary.

    ``pinyin_dict`` is read once at construction; its first line must be a
    JSON object mapping underscore-separated pinyin to a list of
    ``[word, frequency]`` candidates.
    """

    def __init__(self):
        self.pp = PinYin()
        self.pp.load_word()
        # Only the first line of the file is parsed.
        with open('pinyin_dict', 'r') as ff:
            self.jj_dict = json.loads(ff.readline())

    def correct(self, phrase_list):
        """Return up to five alternative spellings of ``phrase_list``.

        Every regrouping produced by ``recompose`` is expanded into all
        combinations of same-pinyin candidates; each combination is scored
        with ``sum(frequency * len(word) ** 7)`` and the best five are kept.

        Args:
            phrase_list: list of word strings (the segmented input).

        Returns:
            list of candidate strings, best first.  Empty when the input is
            empty or when the best candidate equals the input itself; the
            input string is never part of the result.
        """
        if not phrase_list:
            # The original crashed with IndexError on an empty input.
            return []

        termlist = []
        for nnlist in self.recompose(phrase_list):
            # One candidate list per word of this regrouping.
            tmp_correct = []
            for item in nnlist:
                py = self.pp.hanzi2pinyin_split(item, '_')
                # Unknown pinyin (or an empty entry) falls back to the word
                # itself with frequency 1.
                tmp_correct.append(list(self.jj_dict.get(py) or [(item, 1)]))

            length = len(tmp_correct)
            correct_num = [0] * length
            notend = True
            while notend:
                tmpstr = ''
                score = 0
                for j in range(length):
                    candidate = tmp_correct[j][correct_num[j]]
                    tmps = candidate[0]
                    tmpstr += tmps
                    score += int(candidate[1]) * len(tmps) ** 7
                termlist.append(tup(tmpstr, score))

                # Advance the per-word indices like an odometer whose first
                # digit moves fastest; stop after the last digit wraps.
                i = 0
                correct_num[0] += 1
                while correct_num[i] >= len(tmp_correct[i]):
                    correct_num[i] = 0
                    if i < length - 1:
                        correct_num[i + 1] += 1
                        i += 1
                    else:
                        notend = False

        result_list = self.sscore(termlist)
        comstr = ''.join(phrase_list)
        if result_list and result_list[0] == comstr:
            return []
        if comstr in result_list:
            result_list.remove(comstr)
        return result_list

    def sscore(self, termlist):
        """Return the ``term`` of the five largest items, largest first."""
        heap = list(termlist)
        heapq.heapify(heap)
        # Discard the smallest items until only five remain.
        while len(heap) > 5:
            heapq.heappop(heap)
        result_list = []
        while heap:
            result_list.append(heapq.heappop(heap).term)
        result_list.reverse()
        return result_list

    def recompose(self, phrase_list):
        """List the regroupings obtained by merging single-character words.

        Every one-character word is tried in three states: merged into the
        previous word (0), merged into the next word (1), or left alone (2).
        All state combinations are enumerated and duplicates are dropped.

        Args:
            phrase_list: list of word strings; it is not modified.

        Returns:
            list of word lists.  When there is no one-character word the
            result is ``[phrase_list]`` (the same list object).
        """
        position = []  # indices of the one-character words
        attach = {}    # index -> current state (0, 1 or 2)
        cpl = []       # the result: list of lists
        for i, item in enumerate(phrase_list):
            if len(item) == 1:
                position.append(i)
                attach[i] = 0

        length = len(position)
        if length == 0:
            cpl.append(phrase_list)
            return cpl

        notend = True
        while notend:
            gap = 0  # words removed so far; shifts later indices left
            tmp_list = copy.deepcopy(phrase_list)
            tmp_position = copy.deepcopy(position)
            pi = 0
            while pi < len(tmp_position):
                item2 = tmp_position[pi]
                if attach[item2] == 0:
                    if item2 - 1 - gap >= 0:
                        # Append the single character to the previous word.
                        tmp_list[item2 - 1 - gap] += tmp_list[item2 - gap]
                        k = tmp_position.index(item2)
                        tmp_position.pop(k)
                        tmp_list.pop(item2 - gap)
                        gap += 1
                    else:
                        pi += 1
                else:
                    if attach[item2] == 1:
                        if item2 + 1 - gap < len(tmp_list):
                            # Prepend the single character to the next word.
                            tmp_list[item2 + 1 - gap] = \
                                tmp_list[item2 - gap] + tmp_list[item2 + 1 - gap]
                            k = tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2 - gap)
                            gap += 1
                            # The next word is no longer a single character.
                            if item2 + 1 in tmp_position:
                                tmp_position.pop(tmp_position.index(item2 + 1))
                        else:
                            pi += 1
                    else:
                        pi += 1

            if tmp_list not in cpl:
                cpl.append(tmp_list)

            # Change one state at a time, carrying over like a base-3 counter.
            attach[position[0]] += 1
            i = 0
            while attach[position[i]] >= 3:
                attach[position[i]] = 0
                if i < length - 1:
                    attach[position[i + 1]] += 1
                    i += 1
                else:
                    notend = False
        return cpl
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Demo: print a sample sentence as dash-separated pinyin."""
from pinyin import PinYin

converter = PinYin()
converter.load_word()

sample = "钓鱼岛是中国的"
converted = converter.hanzi2pinyin_split(string=sample, split="-")
print("out: %s" % converted)
u"爱奇艺", u"腾讯视频", u"qq", u"熊猫tv", u"快递", u'4399', } word2pinyin = PinYin() word2pinyin.load_word() alphabet = {'a':1, 'b':1, 'c':1, 'd':1, 'e':1, 'f':1, 'g':1, 'h':1, 'i':1, 'j':1, 'k':1, 'l':1, 'm':1, 'n':1, 'o':1, 'p':1, 'q':1, 'r':1, 's':1, 't':1, 'u':1, 'v':1, 'w':1, 'x':1, 'y':1, 'z':1} def hanzi2pinyi(word): result = [] for hanzi in word: if hanzi.lower() in alphabet: result.append(hanzi.lower()) else: result.append(word2pinyin.hanzi2pinyin(hanzi)) return ''.join(result) if __name__ == '__main__': for word in white_list: print word2pinyin.hanzi2pinyin_split(word)
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Print the first command-line argument as underscore-separated pinyin."""
from pinyin import PinYin
import sys

converter = PinYin('application/libraries/pinyin.py/word.data')
converter.load_word()

converted = converter.hanzi2pinyin_split(string=sys.argv[1], split='_')
print(converted)
class correction:
    """Suggest replacements for a segmented phrase via a pinyin dictionary.

    ``pinyin_dict`` is read once at construction; its first line must be a
    JSON object mapping underscore-separated pinyin to a list of
    ``[word, frequency]`` candidates.
    """

    def __init__(self):
        self.pp = PinYin()
        self.pp.load_word()
        # Only the first line of the file is parsed.
        with open('pinyin_dict', 'r') as ff:
            self.jj_dict = json.loads(ff.readline())

    def correct(self, phrase_list):
        """Return up to five alternative spellings of ``phrase_list``.

        Every regrouping produced by ``recompose`` is expanded into all
        combinations of same-pinyin candidates; each combination is scored
        with ``sum(frequency * len(word) ** 7)`` and the best five are kept.

        Args:
            phrase_list: list of word strings (the segmented input).

        Returns:
            list of candidate strings, best first.  Empty when the input is
            empty or when the best candidate equals the input itself; the
            input string is never part of the result.
        """
        if not phrase_list:
            # The original crashed with IndexError on an empty input.
            return []

        termlist = []
        for nnlist in self.recompose(phrase_list):
            # One candidate list per word of this regrouping.
            tmp_correct = []
            for item in nnlist:
                py = self.pp.hanzi2pinyin_split(item, '_')
                # Unknown pinyin (or an empty entry) falls back to the word
                # itself with frequency 1.
                tmp_correct.append(list(self.jj_dict.get(py) or [(item, 1)]))

            length = len(tmp_correct)
            correct_num = [0] * length
            notend = True
            while notend:
                tmpstr = ''
                score = 0
                for j in range(length):
                    candidate = tmp_correct[j][correct_num[j]]
                    tmps = candidate[0]
                    tmpstr += tmps
                    score += int(candidate[1]) * len(tmps) ** 7
                termlist.append(tup(tmpstr, score))

                # Advance the per-word indices like an odometer whose first
                # digit moves fastest; stop after the last digit wraps.
                i = 0
                correct_num[0] += 1
                while correct_num[i] >= len(tmp_correct[i]):
                    correct_num[i] = 0
                    if i < length - 1:
                        correct_num[i + 1] += 1
                        i += 1
                    else:
                        notend = False

        result_list = self.sscore(termlist)
        comstr = ''.join(phrase_list)
        if result_list and result_list[0] == comstr:
            return []
        if comstr in result_list:
            result_list.remove(comstr)
        return result_list

    def sscore(self, termlist):
        """Return the ``term`` of the five largest items, largest first."""
        heap = list(termlist)
        heapq.heapify(heap)
        # Discard the smallest items until only five remain.
        while len(heap) > 5:
            heapq.heappop(heap)
        result_list = []
        while heap:
            result_list.append(heapq.heappop(heap).term)
        result_list.reverse()
        return result_list

    def recompose(self, phrase_list):
        """List the regroupings obtained by merging single-character words.

        Every one-character word is tried in three states: merged into the
        previous word (0), merged into the next word (1), or left alone (2).
        All state combinations are enumerated and duplicates are dropped.

        Args:
            phrase_list: list of word strings; it is not modified.

        Returns:
            list of word lists.  When there is no one-character word the
            result is ``[phrase_list]`` (the same list object).
        """
        position = []  # indices of the one-character words
        attach = {}    # index -> current state (0, 1 or 2)
        cpl = []       # the result: list of lists
        for i, item in enumerate(phrase_list):
            if len(item) == 1:
                position.append(i)
                attach[i] = 0

        length = len(position)
        if length == 0:
            cpl.append(phrase_list)
            return cpl

        notend = True
        while notend:
            gap = 0  # words removed so far; shifts later indices left
            tmp_list = copy.deepcopy(phrase_list)
            tmp_position = copy.deepcopy(position)
            pi = 0
            while pi < len(tmp_position):
                item2 = tmp_position[pi]
                if attach[item2] == 0:
                    if item2 - 1 - gap >= 0:
                        # Append the single character to the previous word.
                        tmp_list[item2 - 1 - gap] += tmp_list[item2 - gap]
                        k = tmp_position.index(item2)
                        tmp_position.pop(k)
                        tmp_list.pop(item2 - gap)
                        gap += 1
                    else:
                        pi += 1
                else:
                    if attach[item2] == 1:
                        if item2 + 1 - gap < len(tmp_list):
                            # Prepend the single character to the next word.
                            tmp_list[item2 + 1 - gap] = \
                                tmp_list[item2 - gap] + tmp_list[item2 + 1 - gap]
                            k = tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2 - gap)
                            gap += 1
                            # The next word is no longer a single character.
                            if item2 + 1 in tmp_position:
                                tmp_position.pop(tmp_position.index(item2 + 1))
                        else:
                            pi += 1
                    else:
                        pi += 1

            if tmp_list not in cpl:
                cpl.append(tmp_list)

            # Change one state at a time, carrying over like a base-3 counter.
            attach[position[0]] += 1
            i = 0
            while attach[position[i]] >= 3:
                attach[position[i]] = 0
                if i < length - 1:
                    attach[position[i + 1]] += 1
                    i += 1
                else:
                    notend = False
        return cpl
positionTypes[name] = nid # parse file and import data to db fd = open(employee_file, 'r') for line in fd.readlines(): value = {} line = line.strip() items = line.split('\t') name = items[1] no = items[2] if no in nos.keys(): continue username = h2p.hanzi2pinyin_split(string=name, split='') i = 0 while username in userNames.keys(): i = i + 1 username = username + str(i) departmentName = items[3] if departmentName not in departmentTypes.keys(): cursor.execute("insert into DeptType (name) values(%s)", (departmentName)) cursor.execute("select id from DeptType where name=%s", (departmentName)) id = cursor.fetchone()[0] departmentTypes[departmentName] = id department = departmentTypes[departmentName] positionName = items[4] if positionName not in positionTypes.keys():
# -*- coding: utf-8 -*-
# from models import Cnword
"""Demo: print a sample sentence with both pinyin conversion methods."""
from pinyin import PinYin

converter = PinYin()
converter.load_word()

for convert in (converter.hanzi2pinyin, converter.hanzi2pinyin_split):
    print(convert(string='钓鱼岛是中国的'))
def hanziToPinyin(hanzi):
    """Convert ``hanzi`` to underscore-separated pinyin.

    The ``PinYin`` converter is created and loaded on the first call and then
    reused, instead of reloading its dictionary on every call.

    NOTE(review): assumes ``hanzi2pinyin_split`` keeps no per-call state on
    the converter - confirm against the PinYin implementation.

    Args:
        hanzi: text to convert.

    Returns:
        Whatever ``PinYin.hanzi2pinyin_split(string=hanzi, split="_")``
        returns.
    """
    converter = getattr(hanziToPinyin, '_converter', None)
    if converter is None:
        converter = PinYin()
        converter.load_word()
        # Cache only after a successful load so a failure is retried.
        hanziToPinyin._converter = converter
    return converter.hanzi2pinyin_split(string=hanzi, split="_")