예제 #1
0
def get_pinyin_data():
    """Return the shared PinYin converter, creating it on first use.

    The converter is cached in the module-level ``gPinYin`` global. While
    that global is still ``None`` a converter is built from
    'pinyin_word.data' and its word table is loaded.
    """
    global gPinYin
    if gPinYin is not None:
        return gPinYin
    gPinYin = PinYin('pinyin_word.data')
    gPinYin.load_word()
    return gPinYin
예제 #2
0
def get_item(marc_no, status=0):
    dict = {}
    test = PinYin()
    test.load_word('word.data')
    hm = requests.get(ourl + 'item.php?marc_no=' + str(marc_no)).text.encode(
        encoder).decode('utf8').replace(' ', '')
    parser = HTMLParser.HTMLParser()
    s1 = parser.unescape(hm)
    static = re.findall('<div id="book_info">(.*?)<div class="clear"></div>',
                        s1, re.S)[0]
    booklist = re.findall('<dl class="booklist">(.*?)</dl>', static, re.S)
    for each in booklist:
        pm = re.findall('<dt>(.*?)</dt>', each, re.S)[0]
        if pm == '':
            continue
        st = re.findall('<dd>(.*?)</dd>', each, re.S)[0]
        try:
            st1 = re.findall('>(.*?)</a>', st, re.S)[0]
        except:
            st1 = st
        pms = test.hanzi2pinyin_split(string=pm, split="",
                                      firstcode=True).replace('/', '')
        dict[pms] = st1
        if status == 1:
            print pm,
            print st1
    return dict
예제 #3
0
def get_pinyin_data():
    """Return the shared PinYin converter, creating it on first use.

    The converter is cached in the module-level ``gPinYin`` global. While
    that global is still ``None`` a converter is built from
    'pinyin_word.data' and its word table is loaded.
    """
    global gPinYin
    if gPinYin is not None:
        return gPinYin
    gPinYin = PinYin('pinyin_word.data')
    gPinYin.load_word()
    return gPinYin
예제 #4
0
파일: zcf520.py 프로젝트: Yuntong/script
def zcf(namelist):
    """Ask for a keyword and return the indexes of the names matching it.

    A name matches when the keyword has exactly one character per character
    of the name and every keyword character equals the first letter of the
    corresponding pinyin syllable.

    :param namelist: sequence of UTF-8 encoded names.
    :return: list of matching indexes, or None when nothing matches.
    """
    matches = []
    converter = PinYin()
    converter.load_word()
    key = raw_input("关键词  :  ")
    for index, name in enumerate(namelist):
        syllables = converter.hanzi2pinyin(str(name))
        charnum = len(list(name.decode('utf-8')))
        if len(key) != charnum:
            continue
        # Compare every position (no short-circuit) before deciding.
        hits = [syllables[pos][0] == key[pos] for pos in range(charnum)]
        if all(hits):
            matches.append(index)
    if not matches:
        return None
    return matches
예제 #5
0
def main():
    """Load 'video.json' and insert its records into the MySQL tables.

    For every record a ``resource`` row is inserted with random duration,
    visitor and like counters. The record's ``tag1`` value is looked up in
    ``category`` and created there (with a pinyin name) when missing, then
    linked through ``category_mapping``. Each entry of ``screen`` becomes a
    ``screenshots`` row.

    NOTE(review): every statement is assembled with ``str.format`` from the
    JSON values, so a quote in the data breaks (or injects into) the SQL.
    Parameterised queries would need support from ``MySQLHander``.
    """
    mysql = MySQLHander()
    p = PinYin()
    p.load_word()
    with open('video.json') as json_file:
        alldata = json.load(json_file)

    resource_sql = "INSERT INTO resource VALUES (null, '', '{title}', '{desc}', '{thumb}', '{url}',{duration}, {vister}, {likes},{creat_time}, '{up_time}')"
    mapping_sql = "INSERT INTO category_mapping values(null, {cid}, {rid}, {time})"

    for data in alldata:
        rid = mysql.insert(resource_sql.format(
            title=data['name'].encode('utf-8', 'ignore'),
            desc=data['name'].encode('utf-8', 'ignore'),
            thumb=data['thumb'],
            url=data['url'],
            duration=random.randint(80, 120),
            vister=random.randint(4500, 9999),
            likes=random.randint(500, 2000),
            creat_time=time.time(),
            up_time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))))

        # xrange(1, 2) yields only 1, so just the "tag1" key is processed.
        for tag in xrange(1, 2):
            tagname = data["tag{0}".format(tag)].encode('utf-8', 'ignore')
            mysql.query("SELECT id from category WHERE cname='{0}'".format(tagname))
            result = mysql.fetchOneRow()
            if result:
                tagid = result[0]
            else:
                ename = p.hanzi2pinyin_split(string=tagname, split="-").replace('-', '')
                tagid = mysql.insert("INSERT INTO category values(null, '{ename}', '{cname}', {time})".format(ename=ename, cname=tagname, time=int(time.time())))
            mysql.insert(mapping_sql.format(cid=tagid, rid=rid, time=int(time.time())))

        for pic in data['screen']:
            mysql.insert("INSERT INTO screenshots values(null, {rid}, '{pic}', {time})".format(rid=rid, pic=pic, time=int(time.time())))

        print("{0} done".format(data['thumb']))

    mysql.close()
예제 #6
0
def writeCityName():
    if not os.path.exists('cityName.csv'):
        url = "http://www.zxinc.org/gb2260.htm"
        print 'start reading ...'
        response = urllib.urlopen(url)
        page = response.read()
        page = page.decode('utf8')
        print 'reading done...'
        pattern = re.compile(ur'([\u4e00-\u9fa5]{2,5}市)')
        match = pattern.findall(page)
        if match:
            try:
                with open('cityName.csv', 'wb') as csvfile:
                    csvWrite = csv.writer(csvfile,
                                          delimiter=' ',
                                          quotechar='|',
                                          quoting=csv.QUOTE_MINIMAL)
                    csvfile.write(codecs.BOM_UTF8)
                    test = PinYin()
                    test.load_word()
                    for result in match:
                        result = result.encode('utf8')
                        py = test.hanzi2pinyin(string=result[:-3])
                        csvWrite.writerow([result[:-3], py[-1]])
                print 'write done!'
            except Exception as e:
                print e
            finally:
                csvfile.close()
    else:
        print 'cityName.csv detected'
예제 #7
0
class rhyRobot:
    """Enumerate pinyin strings that keep each syllable's final.

    Every syllable of a phrase is split into a leading part and a rest; the
    rest is then recombined with each entry of ``shengMu`` and kept when the
    result is a key of the module-level ``possibleDict``.
    """

    # if baidu doesnot work.Try use proxy.
    def __init__(self):
        """Load the pinyin converter and the initial / whole-syllable tables."""
        self.pinYinRobot = PinYin()
        self.pinYinRobot.load_word()
        self.shengMu = [
            "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q",
            "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w"
        ]
        self.zhengTi = [
            "zhi", "chi", "shi", "ri", "zi", "ci", "si", "yu", "ye", "yue",
            "yuan", "yin", "yun", "ying"
        ]
        print("pinYinRobot is loaded")

    def findRhyForWords(self, chinese):
        """Print the candidate pinyin strings for ``chinese``.

        When any syllable is listed in ``zhengTi`` a message is printed and
        the method returns without searching.
        """
        pinYinList = self.pinYinRobot.hanzi2pinyin(chinese)
        for syllable in pinYinList:
            if syllable in self.zhengTi:
                print(syllable + " is whole,cant rhy")
                return
        pairs = self.__findPinYinTuple(pinYinList)
        print(self.__findAllPosiblePinYin(pairs))

    def __getResultFromBaidu(self, allPossibleWord):
        """Placeholder; not implemented."""
        pass

    def __getResultFromLocal(self, allPossibleWord):
        """Placeholder; not implemented."""
        pass

    def __findAllPosiblePinYin(self, pinYinTuple):
        """Return ``(pinyin string, 0)`` for every accepted combination.

        ``LoopMachine`` supplies one ``shengMu`` index per syllable; a
        combination is kept only when every rebuilt syllable is a key of
        ``possibleDict``.
        """
        machine = LoopMachine(len(pinYinTuple), len(self.shengMu))
        found = []
        while machine.shouldStop():
            indexes = machine.getLoopIndex()
            parts = []
            for pos in range(len(indexes)):
                candidate = self.shengMu[indexes[pos]] + pinYinTuple[pos][1]
                if not possibleDict.has_key(candidate):
                    break
                parts.append(candidate + ' ')
            else:
                # Reached only when no syllable was rejected.
                found.append((''.join(parts), 0))
            machine.incr()
        return found

    def __findPinYinTuple(self, pinYinList):
        """Split each syllable into ``(head, rest)``.

        The head is two characters for "zh", "ch" and "sh", otherwise one.
        """
        pairs = []
        for item in pinYinList:
            cut = 2 if item[:2] in ("zh", "ch", "sh") else 1
            pairs.append((item[:cut], item[cut:]))
        return pairs
예제 #8
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
def t2():
    """Debug helper: print pinyin renderings of a hard-coded mixed name.

    The name is first re-spaced character by character, its Chinese runs
    are extracted, and for every combination returned by
    ``Cartesian_product`` the runs are replaced by their syllables and the
    result is printed. Nothing is returned.
    """
    test = PinYin()
    test.load_word()
    #string = u"Kottlers古玩城"
    #string = u"Head 2 Toe发型店"
    #string = u"蓝"
    #print string
    #print test.hanzi2pinyin(string=string)
    #print Cartesian_product(test.hanzi2pinyin(string=string))

    # Only the last assignment takes effect; the earlier ones are leftover
    # test inputs.
    name = u"普季(商城)"
    name = u"Kottlers古玩城"
    name = u"hello 艾压(重庆店)山"
    name = u"库兰达(库兰达热带雨林)"
    #name = u"盛文甘hello店(店)"
    #name = u"义乌三期市场(原篁园市场)"
    print name
    p = re.compile(u'[\u4e00-\u9fa5]+')  # runs of CJK characters
    p_eng = re.compile(u'[a-zA-Z]+')  # compiled but never used below
    j = 0
    strs = []
    # Copy the name character by character, appending a space after a
    # character whenever is_hz_py(current, next) is falsy.
    while (j < len(name)):

        #for j in xrange(len(name)):
        #    if j

        if j + 1 == len(name):
            strs.append(name[j])
        else:
            print(name[j], name[j + 1]), is_hz_py(name[j], name[j + 1])
            if not is_hz_py(name[j], name[j + 1]):
                print name[j], j
                strs.append(name[j] + u" ")
            else:
                strs.append(name[j])
        j += 1
    name = "".join(strs)
    ch_names = p.findall(name)
    tmp = name  # unused
    ll = []  # unused
    mydict = {}  # unused
    cnames = "".join([ch_name for ch_name in ch_names])
    #pys = test.hanzi2pinyin(string=cnames)
    pys = Cartesian_product(test.hanzi2pinyin(string=cnames))
    print cnames, pys, ch_names
    # NOTE: the loop variable rebinds ``p`` (the compiled regex above).
    for p in pys:
        tmp2 = name
        for ch_name in ch_names:
            # Position of this run inside the concatenated Chinese text
            # selects the matching slice of syllables.
            m = re.search(ch_name, cnames)
            _start = m.start()
            _end = m.end()
            replace = " ".join([k for k in p.split()[_start:_end]])
            print _start, _end, replace, tmp2
            tmp2 = re.sub(ch_name, replace, tmp2, 1)
        print tmp2
예제 #9
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
def t2():
    """Debug helper: print pinyin renderings of a hard-coded mixed name.

    The name is first re-spaced character by character, its Chinese runs
    are extracted, and for every combination returned by
    ``Cartesian_product`` the runs are replaced by their syllables and the
    result is printed. Nothing is returned.
    """
    test = PinYin()
    test.load_word()
    #string = u"Kottlers古玩城"
    #string = u"Head 2 Toe发型店"
    #string = u"蓝"
    #print string
    #print test.hanzi2pinyin(string=string)
    #print Cartesian_product(test.hanzi2pinyin(string=string))

    # Only the last assignment takes effect; the earlier ones are leftover
    # test inputs.
    name = u"普季(商城)"
    name = u"Kottlers古玩城"
    name = u"hello 艾压(重庆店)山"
    name = u"库兰达(库兰达热带雨林)"
    #name = u"盛文甘hello店(店)"
    #name = u"义乌三期市场(原篁园市场)"
    print name
    p = re.compile(u'[\u4e00-\u9fa5]+')  # runs of CJK characters
    p_eng = re.compile(u'[a-zA-Z]+')  # compiled but never used below
    j = 0
    strs = []
    # Copy the name character by character, appending a space after a
    # character whenever is_hz_py(current, next) is falsy.
    while (j<len(name)):
        
    #for j in xrange(len(name)):
    #    if j 
        
        if j+1 == len(name):
            strs.append(name[j])
        else:
            print (name[j], name[j+1]), is_hz_py(name[j], name[j+1])
            if not is_hz_py(name[j], name[j+1]):
                print name[j], j
                strs.append(name[j]+u" ")
            else:
                strs.append(name[j])
        j += 1
    name  = "".join(strs)
    ch_names =  p.findall(name)
    tmp = name  # unused
    ll = []  # unused
    mydict = {}  # unused
    cnames = "".join([ch_name for ch_name in ch_names])
    #pys = test.hanzi2pinyin(string=cnames)
    pys = Cartesian_product(test.hanzi2pinyin(string=cnames))
    print cnames, pys, ch_names
    # NOTE: the loop variable rebinds ``p`` (the compiled regex above).
    for p in pys:
        tmp2 = name
        for ch_name in ch_names:
            # Position of this run inside the concatenated Chinese text
            # selects the matching slice of syllables.
            m = re.search(ch_name, cnames)
            _start = m.start()
            _end = m.end()
            replace = " ".join([k for k in p.split()[_start:_end]])
            print _start, _end, replace, tmp2
            tmp2 = re.sub(ch_name, replace, tmp2, 1)
        print tmp2
예제 #10
0
    def get_authors_by_venue(cached_list, cached_set, cdblp_venue, dblp_venue):
        """Match the authors of a C-DBLP venue against a DBLP venue.

        C-DBLP publications come from the 'cdblp-pub-cache.data' cache and
        DBLP publications from the DBLP search API. Each C-DBLP author is
        converted to an English name and counted when that name, or its
        dashed variant for three-character names, occurs among the DBLP
        authors.

        :param cached_list: unused.
        :param cached_set: unused.
        :param cdblp_venue: mapping whose 'title' selects the cached venue.
        :param dblp_venue: mapping whose 'title' is used in the DBLP query.
        :return: dict ``{english name: {'zh': ..., 'count': ...}}``, or None
            when the C-DBLP venue is not in the cache.
        """
        d = DBLPQuery.get_cache('cdblp-pub-cache.data')
        venue_title = cdblp_venue.get('title')

        if venue_title not in d:
            print('This C-DBLP venue is not on file.')
            return

        res = urlopen('http://www.dblp.org/search/api/?q=ce:venue:{}:*&h=750&format=json'.format(dblp_venue.get('title').lower()))
        try:
            raw_json = res.read().decode('utf-8')
        finally:
            # The response was previously left open.
            res.close()
        # fix titles as { "Title ..." }
        # Raw string: the same pattern without invalid-escape warnings.
        fixed_json = re.compile(r'({\s*)(".+")(\s*})').sub(lambda m: m.group(2), raw_json)

        # get publications
        cdblp_pubs = d.get(venue_title)
        dblp_pubs = json.loads(fixed_json)

        cdblp_authors = set()
        dblp_authors = set()
        authors = dict()

        for by_first_key in cdblp_pubs.values():
            for pubs in by_first_key.values():
                for pub in pubs:
                    cdblp_authors.update(pub.get('authors'))

        for pub in dblp_pubs.get('result').get('hits').get('hit'):
            try:
                for author in pub.get('info').get('authors').get('author'):
                    dblp_authors.add(author)
            except AttributeError:
                # A hit without info/authors is reported and skipped.
                print('PublicationException: %s' % pub.get('@id'))

        pinyin = PinYin()
        pinyin.load_word()

        for author in cdblp_authors:
            name_comp = CDBLPAuthor.get_english_name(author, pinyin)
            if name_comp['full_name'] in dblp_authors:
                key = name_comp['full_name']
            # The dashed variant must be looked up among the DBLP authors.
            # The old test against ``authors`` could only succeed for a key
            # that was already stored, so no dashed name was ever added.
            elif len(author) == 3 and name_comp['full_name_dash'] in dblp_authors:
                key = name_comp['full_name_dash']
            else:
                continue

            if key in authors:
                authors[key]['zh'] = name_comp['zh']
                authors[key]['count'] += 1
            else:
                authors[key] = {'zh': name_comp['zh'], 'count': 1}

        return authors
예제 #11
0
 def getciyun(self):
     """Render three word clouds named after the topic's pinyin.

     The topic (``self.aa.topic``) is converted to "-"-separated pinyin and
     used as the name prefix for the answerer, question and location
     clouds, each built by ``cloud.ciyun1`` from a fixed text file.
     """
     converter = PinYin()
     converter.load_word()
     prefix = str(converter.hanzi2pinyin_split(string=str(self.aa.topic), split="-"))
     # (source text file, name suffix), processed in this order.
     jobs = (
         ('F:/zhihu/answer/people_qb.txt', 'people'),
         ('F:/zhihu/answer/question_top10.txt', 'question'),
         ('F:/zhihu/answer/p_location.txt', 'slocation'),
     )
     for path, suffix in jobs:
         cloud.ciyun1(path, prefix + suffix)
예제 #12
0
def name_tran(str):
    """Return the name as "<Given> <Family>" in title-cased pinyin.

    The first character is treated as the family name (its first syllable
    is used). The remaining characters are printed, converted and joined
    without separators to form the given name.

    :param str: the name; its first element is the family name.
    :return: unicode string "<given> <family>", each part title-cased.
    """
    converter = PinYin()
    converter.load_word()
    family = converter.hanzi2pinyin(string=str[0])[0]
    given_chars = str[1:]
    print(given_chars)
    given = u''.join(converter.hanzi2pinyin(string=given_chars))
    return given.title() + u' ' + family.title()
예제 #13
0
    def _generate_name(self):
        """Return de-duplicated fragments built from name, username and email.

        An empty list is returned when both ``self.name`` and ``self.email``
        are falsy. Otherwise each element of ``self.name`` is converted to
        pinyin and formatted with ``built_in.name_formats``, the username is
        formatted with ``built_in.general_formats``, and the e-mail fragments
        are appended.
        """
        if not (self.name or self.email):
            return []

        # true name
        converter = PinYin(PINYIN)
        converter.load_word()
        name_pinyin_list = map(converter.hanzi2pinyin, self.name)

        fragments = list(self._format(name_pinyin_list, built_in.name_formats))
        fragments += self._format(self.username, built_in.general_formats)
        fragments += self._generate_email()
        return list(set(fragments))
예제 #14
0
파일: person.py 프로젝트: LiarBing/genpAss
    def _generate_name(self):
        """Return de-duplicated fragments built from name, username and email.

        An empty list is returned when both ``self.name`` and ``self.email``
        are falsy. Otherwise each element of ``self.name`` is converted to
        pinyin and formatted with ``built_in.name_formats``, the username is
        formatted with ``built_in.general_formats``, and the e-mail fragments
        are appended.
        """
        if not (self.name or self.email):
            return []

        # true name
        converter = PinYin(PINYIN)
        converter.load_word()
        name_pinyin_list = map(converter.hanzi2pinyin, self.name)

        fragments = list(self._format(name_pinyin_list, built_in.name_formats))
        fragments += self._format(self.username, built_in.general_formats)
        fragments += self._generate_email()
        return list(set(fragments))
예제 #15
0
def draw_frame(faces, img, gray, move):
    """Annotate one frame with face boxes, recognised names and the FPS.

    Every face gets a rectangle. The matching region of ``gray`` is resized
    to 200x200 and passed to ``model.predict``; when the second element of
    the result is below 500.0, ``change_identity`` is called with the
    predicted name and the name's pinyin plus that value are drawn above the
    box. A face whose recognition step raises is skipped. Finally the global
    frame counter is incremented, the average FPS since ``t_start`` is
    drawn, and the frame is shown.

    :param faces: iterable of ``(x, y, w, h)`` boxes.
    :param img: colour frame that is drawn on and displayed.
    :param gray: grayscale frame used for recognition.
    :param move: when equal to 2, ``steering_control`` is called first.
    """
    global fps

    if move == 2:
        steering_control(faces, img)

    pyin = None  # loaded on first recognised face, reused for the rest
    # Draw a rectangle around every face
    for (x, y, w, h) in faces:

        cv2.rectangle(img, (x, y), (x + w, y + h), (200, 255, 0), 2)
        # Image arrays are indexed [row, column] = [y, x]. The old slice
        # gray[x:x + w, y:y + h] cropped a transposed region, not the face.
        roi = gray[y:y + h, x:x + w]
        try:
            roi = cv2.resize(roi, (200, 200), interpolation=cv2.INTER_LINEAR)
            params = model.predict(roi)
            if params[1] < 500.0:
                if pyin is None:
                    # Loading the dictionary once per frame replaces the
                    # previous reload for every single face.
                    pyin = PinYin()
                    pyin.load_word()
                pname = names[params[0]]
                change_identity(pname)
                label = ''.join(pyin.hanzi2pinyin_split(string=pname, split=''))
                sign = "%s %.2f" % (label, params[1])
                cv2.putText(img, sign, (x, y - 2), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (0, 0, 255), 2)
        except Exception:
            # Best effort: a face that cannot be recognised is only boxed.
            # ``Exception`` keeps KeyboardInterrupt/SystemExit working,
            # which the previous bare ``except`` swallowed.
            continue

    # Calculate and show the FPS
    fps = fps + 1
    sfps = fps / (time.time() - t_start)
    cv2.putText(img, "FPS : " + str(int(sfps)), (10, 15),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    cv2.imshow("recognize-face", img)
예제 #16
0
파일: mall.py 프로젝트: ChenShuyan/pyb2c
def mallSearch():
    """Search malls by the 'kw' parameter and record keyword statistics.

    Without a keyword the request is redirected to 'index'. Otherwise malls
    whose keyword contains the escaped term are fetched and the ``Keyword``
    row for the term is updated, or created with its pinyin in ``letter``.
    On update each period counter is incremented while the stored update
    time and ``g.siteTime`` give the same ``timetodate`` value for that
    period, and reset to 1 otherwise. The page is rendered with the malls
    and up to ten keywords containing the term.
    """
    kw = MyRequest.getParams('kw')
    if not kw:
        return redirect(url_for('index'))

    kw = addslashes(kw)
    like_pattern = '%'+kw+'%'
    malls = Mall.query.filter(Mall.keyword.like(like_pattern)).all()
    keyword = Keyword.query.filter_by(keyword=kw).first()
    if keyword:
        if len(malls) > keyword.items:
            keyword.items = len(malls)
        # (counter attribute, timetodate format code), in the original order.
        periods = (('month_search', 0), ('week_search', 8), ('today_search', 3))
        for attr, code in periods:
            same_period = timetodate(keyword.updatetime, code) == timetodate(g.siteTime, code)
            if same_period:
                setattr(keyword, attr, getattr(keyword, attr) + 1)
            else:
                setattr(keyword, attr, 1)
        keyword.total_search += 1
        keyword.updatetime = g.siteTime
    else:
        keyword = Keyword()
        keyword.keyword = kw
        keyword.items = len(malls)
        keyword.updatetime = g.siteTime
        keyword.month_search = 1
        keyword.week_search = 1
        keyword.today_search = 1
        keyword.total_search = 1
        py = PinYin(os.path.sep.join([g.rootpath, 'assets', 'word.data']))
        py.load_word()
        keyword.letter = py.hanzi2pinyin_split(string=kw, split=" ")
        db.session.add(keyword)
    db.session.commit()

    rewords = Keyword.query.filter(Keyword.keyword.like(like_pattern)).limit(10).all()

    return render_template('mall/mallSearch.html',malls=malls,g=g,kw=kw,rewords=rewords)
예제 #17
0
파일: index.py 프로젝트: binbin/phone_book
  def post(self):
    """Store the submitted phone-book entry and render the success page.

    The "name", "phone" and "department" request values are HTML-escaped;
    the phone is converted to ``int``, and the pinyin of name and department
    are stored next to them together with today's date.
    """
    here = os.path.dirname(__file__)
    p = PinYin(dict_file=os.path.join(here, 'libs','pinyin','word.data'))
    p.load_word()

    form = self.request
    phone = Phone()
    phone.name = cgi.escape(form.get("name"))
    phone.phone = int(cgi.escape(form.get("phone")))
    phone.department = cgi.escape(form.get("department"))
    phone.name_pinyin = ''.join(p.hanzi2pinyin(string=phone.name))
    phone.department_pinyin = ''.join(p.hanzi2pinyin(string=phone.department))
    phone.hire_date = datetime.datetime.now().date()
    phone.put()

    path = os.path.join(here, 'templates','success.html')
    self.response.out.write(template.render(path,{}))
예제 #18
0
class changetopinyin:
    """Group word/frequency records by pinyin and dump them as JSON."""

    # NOTE(review): declared on the class, so every instance shares (and
    # mutates) this one dictionary.
    wf_dict={}

    def __init__(self):
        """Create the converter and load its word table."""
        self.test=PinYin()
        self.test.load_word()

    def change(self,filename):
        """Read "<word> <frequency>" lines and write 'pinyin_dict2'.

        Every line becomes a ``word_fre`` record stored under its pinyin
        key; afterwards the stored records are replaced by
        ``(word, fre)`` tuples and the dictionary is saved.
        """
        with open(filename,'r') as ff:
            for item in ff.readlines():
                fields=item.split(' ')
                word,fre=fields[0],int(fields[1])
                self.addtodict(word_fre(word,fre))
        for bucket in self.wf_dict.itervalues():
            for pos,entry in enumerate(bucket):
                try:
                    bucket[pos]=(entry.word,entry.fre)
                except Exception as e:
                    print(e)
        self.save('pinyin_dict2')

    def addtodict(self,wf):
        """Store ``wf`` in the heap kept under the pinyin of ``wf.word``.

        A heap grows while it holds at most five records; after that each
        new record is pushed and the smallest one is popped.
        """
        pp=self.test.hanzi2pinyin_split(wf.word,'_')
        bucket=self.wf_dict.get(pp)
        if bucket is None:
            self.wf_dict[pp]=[wf]
        elif len(bucket)<=5:
            heapq.heappush(bucket,wf)
        else:
            heapq.heappushpop(bucket,wf)

    def save(self,filename):
        """Write the dictionary to ``filename`` as JSON; errors are printed."""
        with open(filename,'w') as wff:
            try:
                payload=json.dumps(self.wf_dict)
                wff.write(payload)
            except Exception as e:
                print(e)
예제 #19
0
    def get_sample_users():
        """Fetch the sample authors and cache them in 'author-cache.data'.

        Every link of the user page whose ``href`` starts with "homepage/"
        and that contains a ``<strong>`` child contributes one author: the
        first content item of that child converted by
        ``CDBLPAuthor.getEnglishName``.

        :return: list of the converted author names.
        """
        res = urlopen('http://easyscholar.ruc.edu.cn/moreuser.html')
        dom = BeautifulSoup(res)

        author_list = []
        for author_tag in dom.find_all(href=re.compile('^homepage/')):
            strong = author_tag.findChild('strong')
            if strong:
                author_list.append(CDBLPAuthor.getEnglishName(strong.contents[0]))

        # Serialise first and open the cache last: a failed download or
        # conversion no longer truncates an existing cache, and the handle
        # is closed even when the write fails.
        payload = json.dumps(author_list)
        with open('author-cache.data', 'w') as cache:
            cache.write(payload)
        return author_list
예제 #20
0
def idiomFind(x):
    """Return a random line of 'idiom.txt' whose first syllable is ``x``.

    The lines are shuffled, and the first three bytes of each line are
    decoded as UTF-8 and converted to pinyin. A line that cannot be decoded
    or converted is skipped.

    :param x: pinyin syllable to look for.
    :return: the matching line (with its line ending), or None.
    :raises ValueError: when ``x`` is None. ValueError is an ``Exception``
        subclass, so callers catching ``Exception`` are unaffected.
    """
    if x is None:
        raise ValueError('x must not be None')

    with open('idiom.txt', 'r') as f:
        base = f.readlines()
    random.shuffle(base)

    # Load the dictionary once instead of once per line. A failure to load
    # now propagates instead of being retried and swallowed for every line.
    test = PinYin()
    test.load_word()

    for i in base:
        if len(i) <= 1:
            continue
        try:
            # Decoding sits inside the guard so a line whose first three
            # bytes are not valid UTF-8 is skipped rather than raised.
            py = test.hanzi2pinyin(i[:3].decode('utf8'))[0]
        except Exception:
            continue
        if py == x:
            return i
    return None
예제 #21
0
class changetopinyin:
    """Group word/frequency records by pinyin and dump them as JSON."""

    # NOTE(review): declared on the class, so every instance shares (and
    # mutates) this one dictionary.
    wf_dict = {}

    def __init__(self):
        """Create the converter and load its word table."""
        self.test = PinYin()
        self.test.load_word()

    def change(self, filename):
        """Read "<word> <frequency>" lines and write 'pinyin_dict2'.

        Every line becomes a ``word_fre`` record stored under its pinyin
        key; afterwards the stored records are replaced by
        ``(word, fre)`` tuples and the dictionary is saved.
        """
        with open(filename, 'r') as ff:
            for item in ff.readlines():
                fields = item.split(' ')
                word, fre = fields[0], int(fields[1])
                self.addtodict(word_fre(word, fre))
        for bucket in self.wf_dict.itervalues():
            for pos, entry in enumerate(bucket):
                try:
                    bucket[pos] = (entry.word, entry.fre)
                except Exception as e:
                    print(e)
        self.save('pinyin_dict2')

    def addtodict(self, wf):
        """Store ``wf`` in the heap kept under the pinyin of ``wf.word``.

        A heap grows while it holds at most five records; after that each
        new record is pushed and the smallest one is popped.
        """
        pp = self.test.hanzi2pinyin_split(wf.word, '_')
        bucket = self.wf_dict.get(pp)
        if bucket is None:
            self.wf_dict[pp] = [wf]
        elif len(bucket) <= 5:
            heapq.heappush(bucket, wf)
        else:
            heapq.heappushpop(bucket, wf)

    def save(self, filename):
        """Write the dictionary to ``filename`` as JSON; errors are printed."""
        with open(filename, 'w') as wff:
            try:
                payload = json.dumps(self.wf_dict)
                wff.write(payload)
            except Exception as e:
                print(e)
예제 #22
0
파일: person.py 프로젝트: sulinx/genpAss
    def _generate_name(self):
        '''Build password fragments from the real name, username and email.

        Returns an empty list when username, email and name are all falsy.

        :return: list of unique strings
        '''
        if not any((self.username, self.email, self.name)):
            return []

        # real name: every element of self.name is converted to pinyin
        converter = PinYin(PINYIN)
        converter.load_word()
        name_pinyin_list = map(converter.hanzi2pinyin, self.name)
        fragments = list(self._generator(name_pinyin_list, built_in.name_formats))

        # username
        fragments += self._generator(self.username, built_in.general_formats)

        # email id string
        fragments += self._generate_email()

        return list(set(fragments))
    def Convert(self):
        """Add an X-PHONETIC-LAST-NAME line after each matching contact line.

        Every line of ``self.filename`` is kept. When a line contains an
        ``N:...;`` field, a line "X-PHONETIC-LAST-NAME:" followed by the
        capitalised pinyin syllables of that field is inserted after it.
        The result is then written out.
        """
        py_engine = PinYin()
        py_engine.load_word()

        contact = []
        # One handle read in order replaces the previous pair of handles
        # (one iterated, one read from), neither of which was ever closed.
        with open(self.filename, 'r') as source:
            for line in source:
                contact.append(line)
                # "N" is matched literally; the old "\N" escape is rejected
                # by newer versions of the re module.
                k = re.findall(r"(N\:[^\;]*\;)", line)
                if k:
                    phones = py_engine.hanzi2pinyin(string=k[0])
                    syllables = ''.join(
                        item.capitalize() for item in phones if item != '')
                    contact.append("X-PHONETIC-LAST-NAME:" + syllables + "\n")

        # NOTE(review): ``filename`` is not defined in this method;
        # presumably a module-level name. Confirm, or switch to
        # ``self.filename`` if the input file is meant to be rewritten.
        with open(filename, 'w') as fout:
            fout.writelines(contact)
예제 #24
0
    def Convert(self):
        """Rewrite the contacts' N: lines and add phonetic name lines.

        Lines of ``self.filename`` without a four-part ``N:a;b;c;d;`` field
        are copied unchanged. For a matching line the family name (``xing``)
        and given name (``ming``) are taken from the first two parts, an
        ``N:xing;ming;;;`` line is emitted, followed by
        X-PHONETIC-LAST-NAME and X-PHONETIC-FIRST-NAME lines holding the
        capitalised pinyin. The output goes to "ok_" + ``self.filename``.
        """
        py_engine = PinYin()
        py_engine.load_word()

        def phonetic_line(tag, text):
            # One "<tag><Capitalised syllables>\n" line for ``text``.
            phones = py_engine.hanzi2pinyin(string=text)
            return tag + ''.join(
                item.capitalize() for item in phones if item != '') + "\n"

        contact = []
        # One handle read in order replaces the previous pair of handles
        # (one iterated, one read from), neither of which was ever closed.
        with open(self.filename, 'r') as source:
            for line in source:
                # "N" is matched literally; the old "\N" escape is rejected
                # by newer versions of the re module.
                k = re.findall(r"(N\:[^\;]*\;[^\;]*\;[^\;]*\;[^\;]*\;)", line)
                if not k:
                    contact.append(line)
                    continue

                field = k[0]
                first = field.find(';')
                second = field.find(';', first + 1)
                if first - 2 > 3:
                    # First part longer than three units: its first three
                    # form the family name and the rest joins the given
                    # name. Presumably one UTF-8 encoded character; verify.
                    xing = field[2:5]
                    ming = field[5:first] + field[first + 1:second]
                else:
                    xing = field[2:first]
                    ming = field[first + 1:second]
                contact.append('N:' + xing + ';' + ming + ';' + ";;\n")
                contact.append(phonetic_line("X-PHONETIC-LAST-NAME:", xing))
                contact.append(phonetic_line("X-PHONETIC-FIRST-NAME:", ming))

        # The output handle is now closed, and so flushed, deterministically.
        with open("ok_" + self.filename, 'w') as fout:
            fout.writelines(contact)
예제 #25
0
def main(args):
    """Fill ``roominfo.py_name`` for every row where it is still NULL.

    The value stored is the full first pinyin syllable of ``rname``
    followed by the first letter of each remaining syllable. Each update is
    committed individually.

    :param args: unused.
    """
    test = PinYin()
    test.load_word()

    conn = getconn()
    try:
        cursor = conn.cursor()
        cursor.execute('select rname,rid from roominfo where py_name is null')
        rows = cursor.fetchall()

        for rname, rid in rows:
            pylist = test.hanzi2pinyin(string=rname.encode("utf8"))
            if not pylist:
                # Nothing convertible: leave py_name NULL instead of
                # failing on pylist[0].
                continue
            # w[:1] tolerates an empty syllable where w[0] would raise.
            pystr = pylist[0] + ''.join(w[:1] for w in pylist[1:])
            cursor.execute('update roominfo set py_name=? where rid=?',
                           (pystr, rid))
            conn.commit()
    finally:
        # Release the connection even when a query or conversion fails.
        conn.close()
예제 #26
0
from common import get_response_by_url
from mongoservice import Insert,get_category_by_cid,get_by_pinyin,get_all
from bs4 import BeautifulSoup
from pinyin import PinYin
import  os
# presumably the default category id appended to base_url; verify with callers
_cid = 160
# Listing URL; the category id is appended to it.
base_url = "http://www.meishij.net/shiliao.php?cid="
# Module-wide pinyin converter, loaded once at import time.
s_pinyin = PinYin()
s_pinyin.load_word()

# filepath =os.path.abspath("./1.json")
'''获取 理疗分类'''
def get_meishijie_categories(cid,category_pinyin='',category_cn=''):
    """Fetch the listing page of category ``cid`` and select its nav links.

    Only the beginning of this function is available in this excerpt: the
    page at ``base_url + str(cid)`` is downloaded, parsed with
    BeautifulSoup, and the anchors matching
    ".listnav_dl_style1 .current a" are selected. ``category_pinyin`` and
    ``category_cn`` are not used in the visible part.

    NOTE(review): ``get_html_by_url`` is not among the names imported at
    the top of this snippet (``get_response_by_url`` is) - confirm.
    """
    url=base_url+str(cid)
    html =get_html_by_url(url)
    # print(html)
    # soup = BeautifulSoup(html)
    # print(soup)
    # print(soup.prettify())

    sop = BeautifulSoup(html)
    # h = sop.prettify()
    # print( h )
    # head = sop.find('head')
    # print(head)
    # p_categories = sop.findAll(attrs={'id':'listnav_ul'})[0]
    # print(p_categories)

    # dds = sop.select(".listnav_dl_style1 dd a")
    dds = sop.select(".listnav_dl_style1 .current a")
예제 #27
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from pinyin import PinYin

# Demo: convert a sample sentence and print it as "-"-separated pinyin.
test = PinYin()
test.load_word()
string = "钓鱼岛是中国的"
print "out: %s" % test.hanzi2pinyin_split(string=string, split="-")
# -*- coding: utf-8 -*-
# Author: [email protected]
# Copyright 2015 @ NLPJob

#bug fixed : can be use with pyenv environment
#update: this is Python3 script
#Please modify the pinyin.py of https://github.com/cleverdeng/pinyin.py to Python3 by frederic89

import codecs
import sys

from langconv import *
from pinyin import PinYin
# Module-wide pinyin converter, loaded once at import time.
py = PinYin()
py.load_word()

def make_word_4tag(word):
    """Return the B/M/E/S tag string for ``word``.

    An empty word gives "N" and a single-element word gives "S". Longer
    words give "B", one "M" per inner element, then "E".
    """
    size = len(word)
    if size == 0:
        return "N"
    if size == 1:
        return "S"
    return "B" + "M" * (size - 2) + "E"

def make_mecab_seed_data(input_file, output_file):
    """Open ``input_file`` for reading and ``output_file`` for writing.

    Both files are opened as UTF-8 text. The rest of the function is not
    part of this excerpt.
    """
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
예제 #29
0
파일: get_phonetic.py 프로젝트: AmoCat/smp
def get_phonetic(word):
    """Return the concatenated pinyin of ``word``.

    NOTE(review): the converter method is spelled ``hanzipinyin`` here;
    confirm it exists on the PinYin class in use.
    """
    converter = PinYin()
    converter.load_word()
    syllables = converter.hanzipinyin(word)
    return ''.join(syllables)
예제 #30
0
파일: models.py 프로젝트: liutang123/odoo
# -*- coding: utf-8 -*-
import re

from openerp import models, fields, api
from openerp.osv.expression import get_unaccent_wrapper, NEGATIVE_TERM_OPERATORS

from pinyin import PinYin
# Module-wide pinyin converter, loaded once at import time.
han2py = PinYin()
han2py.load_word()
# class multiple_name_search(models.Model):
#     _name = 'multiple_name_search.multiple_name_search'

#     name = fields.Char()

# Shared helper that converts a name to pinyin.
def comman_change_name(name):
    """Return the full pinyin and the pinyin initials of ``name``.

    :param name: text to convert; a falsy value skips the conversion.
    :return: dict with 'pinyin' (all syllables joined) and 'py' (first
        letter of every syllable). Both are False when ``name`` is falsy.
    """
    if not name:
        return {'pinyin': False, 'py': False}

    pinyinArr = han2py.str2pinyin(name)
    print(pinyinArr)
    initials = ''.join([p[0] for p in pinyinArr])
    full = ''.join(pinyinArr)
    return {'pinyin': full, 'py': initials}

class WithPinyinProductTemplate(models.Model):
    """Extension of ``product.template`` adding two pinyin text fields.

    ``pinyin`` holds the full pinyin and ``py`` (indexed) the pinyin
    initials, as described by the fields' help texts.
    """
    _inherit = 'product.template'
    
    pinyin = fields.Char(string='拼音', help='拼音英语表示,如“名称”的拼音是“mingcheng”') # , default=lambda self: comman_change_name(self.name)['pinyin']
    py = fields.Char(string='拼音首字母', index=True, help='拼音英语表示首字母,如“名称”的拼音是“mingcheng”,则它的拼音首字母则为“mc”') # , default=lambda self: comman_change_name(self.name)['py']
    
예제 #31
0
파일: import.py 프로젝트: simon1024/crm
//默认mobile为11111111111
//默认role为普通员工, 8
//默认mail为[email protected]
//position和department进行关联

############################################################"""



import MySQLdb
import md5
from pinyin import PinYin

# init hanzi2pinyin tool (loaded once at import time)
h2p = PinYin()
h2p.load_word()

# global variables
employee_file = "employees"
md5_prefix = 'psd_'
default_pwd = '1q2w3e4r'
default_mobile = '11111111111'
mobile = '11111111111'
role = 8
# pwd is the hex MD5 digest of md5_prefix + default_pwd.
# NOTE(review): MD5 with a fixed prefix is a weak way to store passwords.
m = md5.new(md5_prefix + default_pwd)
m.digest()  # return value is discarded
pwd = m.hexdigest()

# config database
db_host='localhost'
db_user='******'
예제 #32
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re
import xlwt
from pinyin import PinYin, Cartesian_product

# Module-wide pinyin converter, loaded once at import time.
test = PinYin()
test.load_word()

# Workbook that main() saves after processing the input files.
wbk = xlwt.Workbook()


def main():
    """Run ``func`` on each input CSV, then save the shared workbook."""
    for csv_name in ('shop_2.csv', 'sight_2.csv', 'district_2_n.csv'):
        func(csv_name)
    wbk.save("/home/chenyp/sharefolder/cn2pinyin.xls")


def func2(filename):
    #餐馆的输入文档
    poitype = filename.split(".")[0].decode('utf-8')
    column2 = u"%sid" % poitype
    count = 0
    lines = open(filename).readlines()[1:100]
    MAX = 35000
    if len(lines) % MAX == 0:
        total = len(lines) / MAX
    else:
예제 #33
0
                user='******',
                passwd='opensesame',
                db='traincrawler',
                port=3306)
# Force the interpreter's default string encoding to UTF-8. reload(sys) is
# what makes sys.setdefaultencoding callable again here.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

# Query URL template; the two %s slots are DepartureCity and ArrivalCity.
base_url = "http://trains.ctrip.com/TrainBooking/Ajax/GetTrainDataV2.aspx?DepartureCity=%s&ArrivalCity=%s&DepartureDate=2017-03-30&NO=01"
# post_param = 'http://trains.ctrip.com/TrainBooking/Ajax/SearchListHandler.ashx?Action=getSearchList&value={"IsBus": False, "Filter": "0", "Catalog": "", "IsGaoTie":False, "IsDongChe":False, "CatalogName": "", "DepartureCity": %s, "ArrivalCity": %s, "HubCity": "", "DepartureCityName": %s, "ArrivalCityName": %s, "DepartureDate": "2017-03-24", "DepartureDateReturn": "2017-03-26", "ArrivalDate": "", "TrainNumber": ""}'
# Path template with one %s slot.
base_path = 'xc-price/%s'
# Selects up to 100 rows whose task column is still 0.
getStations_sql = 'select id,begin_stop,begin_alia,end_stop,end_alia from train_stop_20170331_task_xc where task=0 limit 100'
# Sets task = 1 for the row with the given id.
update_sql = 'update train_stop_20170331_task_xc set task = 1 where id =%s'
# Module-wide pinyin converter, loaded from 'word.data' at import time.
py_util = PinYin()
py_util.load_word('word.data')


def get(p):
    """Fetch URL *p* and return the response body decoded from GB2312.

    The call sleeps for one second before issuing the request. Any
    exception raised while encoding the URL, fetching, closing or decoding
    is printed and the literal string '500' is returned instead.

    :param p: URL to request; it is UTF-8 encoded before being opened.
    :return: the decoded response text, or '500' on any failure.
    """
    time.sleep(1)
    try:
        response = urllib.urlopen(p.encode('utf-8'))
        try:
            content = response.read()
        finally:
            # Close the response even when read() raises, so the
            # connection is not leaked on a failed download.
            response.close()
        return content.decode('gb2312')
    except Exception as e:
        # Best-effort fetch: report the problem and hand back the sentinel.
        print(e)
        return '500'
예제 #34
0
class correction:
    """Propose same-pinyin replacements for a segmented phrase.

    Every segment is converted to pinyin and looked up in ``self.jj_dict``;
    the alternatives found there are combined into candidate strings, which
    are scored and ranked.
    """

    def __init__(self):
        """Load the pinyin converter and the pinyin -> candidates table."""
        self.pp = PinYin()
        self.pp.load_word()
        # Only the first line of 'pinyin_dict' is parsed as JSON. The
        # ``with`` block closes the file, so no explicit close() is needed.
        with open('pinyin_dict', 'r') as ff:
            self.jj_dict = json.loads(ff.readline())

    def correct(self, phrase_list):
        """Return ranked replacement strings for *phrase_list*.

        :param phrase_list: list of text segments making up the phrase.
        :return: list of candidate strings as ordered by ``sscore`` (at
            most five), with the original joined phrase removed. The list
            is empty when *phrase_list* is empty or when the top-ranked
            candidate is the original phrase itself.
        """
        # An empty phrase has nothing to correct (the enumeration below
        # would otherwise index into an empty counter list).
        if not phrase_list:
            return []

        termlist = []
        for nnlist in self.recompose(phrase_list):
            # tmp_correct[k] holds the (text, weight) alternatives for
            # segment k of this segmentation.
            tmp_correct = []
            for item in nnlist:
                py = self.pp.hanzi2pinyin_split(item, '_')
                candidates = []
                if py in self.jj_dict:
                    candidates = list(self.jj_dict[py])
                if not candidates:
                    # Unknown pinyin (or an empty entry): keep the segment
                    # itself with weight 1.
                    candidates = [(item, 1)]
                tmp_correct.append(candidates)

            length = len(tmp_correct)
            # correct_num[k] is the alternative currently chosen for
            # segment k; it is advanced like an odometer, index 0 fastest.
            correct_num = [0] * length
            notend = True
            while notend:
                tmpstr = ''
                score = 0
                for options, pick in zip(tmp_correct, correct_num):
                    tmps = options[pick][0]
                    tmpstr += tmps
                    # Longer alternatives dominate: weight * len ** 7.
                    score += int(options[pick][1]) * len(tmps) ** 7
                termlist.append(tup(tmpstr, score))

                i = 0
                correct_num[0] += 1
                while correct_num[i] >= len(tmp_correct[i]):
                    correct_num[i] = 0
                    if i < length - 1:
                        correct_num[i + 1] += 1
                        i += 1
                    else:
                        # The last counter wrapped: every combination
                        # has been produced.
                        notend = False

        result_list = self.sscore(termlist)
        comstr = ''.join(phrase_list)
        if result_list and result_list[0] == comstr:
            # The best candidate is the input itself: nothing to suggest.
            return []
        if comstr in result_list:
            result_list.remove(comstr)
        return result_list

    def sscore(self, termlist):
        """Return the ``term`` of the five largest items, largest first.

        :param termlist: iterable of mutually comparable objects exposing
            a ``term`` attribute (NOTE(review): presumably ``tup`` orders
            by score -- confirm against its definition).
        :return: list of at most five ``term`` values in descending order.
        """
        heap = list(termlist)
        heapq.heapify(heap)
        # Drop the smallest items until only five remain.
        while len(heap) > 5:
            heapq.heappop(heap)
        result_list = []
        while heap:
            result_list.append(heapq.heappop(heap).term)
        # Popping yields ascending order; callers want the best first.
        result_list.reverse()
        return result_list

    def recompose(self, phrase_list):
        """Enumerate re-segmentations that re-attach one-character segments.

        Every segment of length 1 is tried in three modes: 0 = glue it onto
        its left neighbour, 1 = glue it onto its right neighbour, 2 = leave
        it alone. All mode combinations are generated and the distinct
        resulting lists are returned in first-seen order.

        :param phrase_list: list of text segments.
        :return: list of segment lists; ``[phrase_list]`` itself when there
            is no one-character segment.
        """
        # Indices of the one-character segments.
        position = [idx for idx, item in enumerate(phrase_list)
                    if len(item) == 1]
        if not position:
            return [phrase_list]

        attach = dict.fromkeys(position, 0)  # index -> current mode
        cpl = []  # the result: list of segment lists
        length = len(position)
        notend = True
        while notend:
            # gap = number of segments already removed from tmp_list, used
            # to map original indices onto the shrinking list.
            gap = 0
            tmp_list = copy.deepcopy(phrase_list)
            tmp_position = copy.deepcopy(position)
            pi = 0
            while pi < len(tmp_position):
                item2 = tmp_position[pi]
                mode = attach[item2]
                if mode == 0 and item2 - 1 - gap >= 0:
                    # Merge into the left neighbour.
                    tmp_list[item2 - 1 - gap] += tmp_list[item2 - gap]
                    tmp_position.remove(item2)
                    tmp_list.pop(item2 - gap)
                    gap += 1
                elif mode == 1 and item2 + 1 - gap < len(tmp_list):
                    # Merge into the right neighbour.
                    tmp_list[item2 + 1 - gap] = (
                        tmp_list[item2 - gap] + tmp_list[item2 + 1 - gap])
                    tmp_position.remove(item2)
                    tmp_list.pop(item2 - gap)
                    gap += 1
                    # The right neighbour is no longer a single character.
                    if item2 + 1 in tmp_position:
                        tmp_position.remove(item2 + 1)
                else:
                    # Mode 2, or no neighbour on the requested side.
                    pi += 1

            if tmp_list not in cpl:
                cpl.append(tmp_list)

            # Advance the mode counters one step (base-3 odometer).
            attach[position[0]] += 1
            i = 0
            while attach[position[i]] >= 3:
                attach[position[i]] = 0
                if i < length - 1:
                    attach[position[i + 1]] += 1
                    i += 1
                else:
                    notend = False
        return cpl
def hanziToPinyin(hanzi):
    """Return ``hanzi2pinyin_split`` of *hanzi* using "_" as the separator.

    A new ``PinYin`` converter is created and loaded on every call.
    """
    converter = PinYin()
    converter.load_word()
    pinyin = converter.hanzi2pinyin_split(string=hanzi, split="_")
    return pinyin
예제 #36
0
파일: temp.py 프로젝트: lizongwei12/Train
#-*- coding: utf-8 -*-
import sys
# Add the parent directory to the module search path (presumably so the
# project modules imported below can be found -- confirm).
sys.path.append("..")
# Force the interpreter-wide default string encoding to UTF-8.
# Python 2 only: reload() and sys.setdefaultencoding() do not exist in
# Python 3.
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)
import json
import traceback
from data_service import DataService
from mysql_util import DataBaseUtil
from pinyin import PinYin
# Module-level pinyin converter, loaded from '../word.data' at import time.
py_util = PinYin()
py_util.load_word('../word.data')
def main():
        # 保存站到站
        temp_trains = ['C6903','C6905','C6907','C6909','C6911','C6913','C6915','C6917','C6919','C6921','C6923','C6925','C6953','C6955','C6957','C6959','C6961','C6963','C6965','C6967','C6969','C6971','C6973','C6975','C6902','C6904','C6906','C6908','C6910','C6912','C6918','C6920','C6922','C6924','C6926','C6952','C6954','C6956','C6958','C6960','C6962','C6966','C6968','C6970','C6972','C6974','C6976','C6929','C6914','C6928','C6930','C6978','C6964','C6980','C6977','C6979','C6981']
        for code in temp_trains:
            names = DataBaseUtil.select("select name from train_line_stop where train_code = '%s' order by sequence" % code)
            if len(names) >  0:
                key = 0
                name_list = {}
                for n in names:
                    name_list[key] = n[0]
                    key = key+1
                kvs = name_list.items()
                i = -1
                for ki,vi in kvs:
                    i += 1
                    j = -1
                    for kj,vj in  kvs:
예제 #37
0
# -*- coding: utf-8 -*-
# Author: [email protected]
# Copyright 2015 @ NLPJob

#bug fixed: can be used with a pyenv environment
#update: this is a Python 3 script

import codecs
import sys

from langconv import *
from pinyin import PinYin
# Module-level pinyin converter; its word table is loaded at import time.
py = PinYin()
py.load_word()


def make_word_4tag(word):
    """Return the per-character position tags for *word*.

    :param word: the word to tag (any sized sequence).
    :return: "N" for an empty word, "S" for a single character, otherwise
        "B" followed by one "M" per inner character and a closing "E".
    """
    size = len(word)
    if size == 0:
        return "N"
    if size == 1:
        return "S"
    # One "M" for every character strictly between the first and the last.
    return "B" + "M" * (size - 2) + "E"


def make_mecab_train_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
예제 #38
0
    u"京东",
    u"淘宝",
    u"百度",
    u"微信",
    u"斗鱼",
    u"爱奇艺",
    u"腾讯视频",
    u"qq",
    u"熊猫tv",
    u"快递",
    u'4399',
}


# Module-level pinyin converter; its word table is loaded at import time.
word2pinyin = PinYin()
word2pinyin.load_word()
# Lowercase ASCII letters. hanzi2pinyi only tests key membership; every
# value is 1.
alphabet = {'a':1, 'b':1, 'c':1, 'd':1, 'e':1, 'f':1, 'g':1,
            'h':1, 'i':1, 'j':1, 'k':1, 'l':1, 'm':1, 'n':1,
            'o':1, 'p':1, 'q':1, 'r':1, 's':1, 't':1,
            'u':1, 'v':1, 'w':1, 'x':1, 'y':1, 'z':1}


def hanzi2pinyi(word):
    """Convert *word* character by character and join the pieces.

    Characters whose lowercase form is a key of ``alphabet`` are kept
    (lowercased); every other character is passed through
    ``word2pinyin.hanzi2pinyin``.
    """
    pieces = []
    for ch in word:
        lowered = ch.lower()
        if lowered in alphabet:
            pieces.append(lowered)
        else:
            pieces.append(word2pinyin.hanzi2pinyin(ch))
    return ''.join(pieces)
예제 #39
0
class correction:
    """Propose same-pinyin replacements for a segmented phrase.

    Every segment is converted to pinyin and looked up in ``self.jj_dict``;
    the alternatives found there are combined into candidate strings, which
    are scored and ranked.
    """

    def __init__(self):
        """Load the pinyin converter and the pinyin -> candidates table."""
        self.pp = PinYin()
        self.pp.load_word()
        # Only the first line of 'pinyin_dict' is parsed as JSON. The
        # ``with`` block closes the file, so no explicit close() is needed.
        with open('pinyin_dict', 'r') as ff:
            self.jj_dict = json.loads(ff.readline())

    def correct(self, phrase_list):
        """Return ranked replacement strings for *phrase_list*.

        :param phrase_list: list of text segments making up the phrase.
        :return: list of candidate strings as ordered by ``sscore`` (at
            most five), with the original joined phrase removed. The list
            is empty when *phrase_list* is empty or when the top-ranked
            candidate is the original phrase itself.
        """
        # An empty phrase has nothing to correct (the enumeration below
        # would otherwise index into an empty counter list).
        if not phrase_list:
            return []

        termlist = []
        for nnlist in self.recompose(phrase_list):
            # tmp_correct[k] holds the (text, weight) alternatives for
            # segment k of this segmentation.
            tmp_correct = []
            for item in nnlist:
                py = self.pp.hanzi2pinyin_split(item, '_')
                candidates = []
                if py in self.jj_dict:
                    candidates = list(self.jj_dict[py])
                if not candidates:
                    # Unknown pinyin (or an empty entry): keep the segment
                    # itself with weight 1.
                    candidates = [(item, 1)]
                tmp_correct.append(candidates)

            length = len(tmp_correct)
            # correct_num[k] is the alternative currently chosen for
            # segment k; it is advanced like an odometer, index 0 fastest.
            correct_num = [0] * length
            notend = True
            while notend:
                tmpstr = ''
                score = 0
                for options, pick in zip(tmp_correct, correct_num):
                    tmps = options[pick][0]
                    tmpstr += tmps
                    # Longer alternatives dominate: weight * len ** 7.
                    score += int(options[pick][1]) * len(tmps) ** 7
                termlist.append(tup(tmpstr, score))

                i = 0
                correct_num[0] += 1
                while correct_num[i] >= len(tmp_correct[i]):
                    correct_num[i] = 0
                    if i < length - 1:
                        correct_num[i + 1] += 1
                        i += 1
                    else:
                        # The last counter wrapped: every combination
                        # has been produced.
                        notend = False

        result_list = self.sscore(termlist)
        comstr = ''.join(phrase_list)
        if result_list and result_list[0] == comstr:
            # The best candidate is the input itself: nothing to suggest.
            return []
        if comstr in result_list:
            result_list.remove(comstr)
        return result_list

    def sscore(self, termlist):
        """Return the ``term`` of the five largest items, largest first.

        :param termlist: iterable of mutually comparable objects exposing
            a ``term`` attribute (NOTE(review): presumably ``tup`` orders
            by score -- confirm against its definition).
        :return: list of at most five ``term`` values in descending order.
        """
        heap = list(termlist)
        heapq.heapify(heap)
        # Drop the smallest items until only five remain.
        while len(heap) > 5:
            heapq.heappop(heap)
        result_list = []
        while heap:
            result_list.append(heapq.heappop(heap).term)
        # Popping yields ascending order; callers want the best first.
        result_list.reverse()
        return result_list

    def recompose(self, phrase_list):
        """Enumerate re-segmentations that re-attach one-character segments.

        Every segment of length 1 is tried in three modes: 0 = glue it onto
        its left neighbour, 1 = glue it onto its right neighbour, 2 = leave
        it alone. All mode combinations are generated and the distinct
        resulting lists are returned in first-seen order.

        :param phrase_list: list of text segments.
        :return: list of segment lists; ``[phrase_list]`` itself when there
            is no one-character segment.
        """
        # Indices of the one-character segments.
        position = [idx for idx, item in enumerate(phrase_list)
                    if len(item) == 1]
        if not position:
            return [phrase_list]

        attach = dict.fromkeys(position, 0)  # index -> current mode
        cpl = []  # the result: list of segment lists
        length = len(position)
        notend = True
        while notend:
            # gap = number of segments already removed from tmp_list, used
            # to map original indices onto the shrinking list.
            gap = 0
            tmp_list = copy.deepcopy(phrase_list)
            tmp_position = copy.deepcopy(position)
            pi = 0
            while pi < len(tmp_position):
                item2 = tmp_position[pi]
                mode = attach[item2]
                if mode == 0 and item2 - 1 - gap >= 0:
                    # Merge into the left neighbour.
                    tmp_list[item2 - 1 - gap] += tmp_list[item2 - gap]
                    tmp_position.remove(item2)
                    tmp_list.pop(item2 - gap)
                    gap += 1
                elif mode == 1 and item2 + 1 - gap < len(tmp_list):
                    # Merge into the right neighbour.
                    tmp_list[item2 + 1 - gap] = (
                        tmp_list[item2 - gap] + tmp_list[item2 + 1 - gap])
                    tmp_position.remove(item2)
                    tmp_list.pop(item2 - gap)
                    gap += 1
                    # The right neighbour is no longer a single character.
                    if item2 + 1 in tmp_position:
                        tmp_position.remove(item2 + 1)
                else:
                    # Mode 2, or no neighbour on the requested side.
                    pi += 1

            if tmp_list not in cpl:
                cpl.append(tmp_list)

            # Advance the mode counters one step (base-3 odometer).
            attach[position[0]] += 1
            i = 0
            while attach[position[i]] >= 3:
                attach[position[i]] = 0
                if i < length - 1:
                    attach[position[i + 1]] += 1
                    i += 1
                else:
                    notend = False
        return cpl
예제 #40
0
    for ch in check_str.decode('utf-8'):
        if ch <= u'\u4e00' or ch >= u'\u9fff':
            return True
    return False


# Load the city list JSON document.
with open('ChinaCityList.json') as json_data:
    d = json.load(json_data)
# Flatten the nested city -> county records into a DataFrame and pull the
# `name` column out as a list of UTF-8 encoded byte strings.
city = json_normalize(data=d, record_path=['city', 'county'])
city_name = city.name.tolist()
city_name = [x.encode('utf-8') for x in city_name]
# Build a dictionary mapping each city name to its pinyin.
trans = PinYin()
trans.load_word()
to_py = trans.hanzi2pinyin  # shorthand reused by the code below
city_py = [to_py(x) for x in city_name]
city_dict = dict(zip(city_name, city_py))


# city chain game
def city_chain(city):
    if len(city) == 0:
        print '错误:请确认是否输入汉字'
    elif check_contain_english(city):
        print '错误:请确认是否输入了非汉字'
    else:
        candidate = []
        py_city = to_py(city)
        py_last_word = py_city[len(py_city) - 1]
예제 #41
0
파일: models.py 프로젝트: liutang123/odoo
# -*- coding: utf-8 -*-
import re

from openerp import models, fields, api
from openerp.osv.expression import get_unaccent_wrapper, NEGATIVE_TERM_OPERATORS

from pinyin import PinYin
# Module-level pinyin converter; its word table is loaded at import time.
han2py = PinYin()
han2py.load_word()

# class multiple_name_search(models.Model):
#     _name = 'multiple_name_search.multiple_name_search'

#     name = fields.Char()


# Shared helper that converts a name to pinyin
def comman_change_name(name):
    """Return the full-pinyin and initials forms of *name*.

    :param name: text to convert; a falsy value skips the conversion.
    :return: dict with two keys. 'pinyin' is every syllable returned by
        ``han2py.str2pinyin`` joined together; 'py' is the first character
        of each syllable joined together. Both are ``False`` when *name*
        is falsy.
    """
    if not name:
        return {'pinyin': False, 'py': False}

    pinyinArr = han2py.str2pinyin(name)
    # p[:1] rather than p[0] so an empty syllable contributes nothing
    # instead of raising IndexError.
    pyStr = ''.join([p[:1] for p in pinyinArr])
    pinyinStr = ''.join(pinyinArr)
    return {'pinyin': pinyinStr, 'py': pyStr}


class WithPinyinProductTemplate(models.Model):
    _inherit = 'product.template'
예제 #42
0
def getPinYin(hanzi):
    """Return ``PinYin.hanzi2pinyin`` of *hanzi*.

    A new ``PinYin`` converter is created and loaded on every call.
    """
    converter = PinYin()
    converter.load_word()
    pinyin = converter.hanzi2pinyin(string=hanzi)
    return pinyin