예제 #1
0
def get_pinyin_data():
    """Return the shared PinYin converter, creating it on first use.

    The instance lives in the module-level ``gPinYin``. When that global is
    still None a converter is built from 'pinyin_word.data', stored, and its
    ``load_word()`` is called; later calls return the stored instance.
    """
    global gPinYin
    if gPinYin is not None:
        return gPinYin
    # Store the instance before loading so the global is set even if the
    # load step raises.
    gPinYin = PinYin('pinyin_word.data')
    gPinYin.load_word()
    return gPinYin
예제 #2
0
def get_item(marc_no, status=0):
    dict = {}
    test = PinYin()
    test.load_word('word.data')
    hm = requests.get(ourl + 'item.php?marc_no=' + str(marc_no)).text.encode(
        encoder).decode('utf8').replace(' ', '')
    parser = HTMLParser.HTMLParser()
    s1 = parser.unescape(hm)
    static = re.findall('<div id="book_info">(.*?)<div class="clear"></div>',
                        s1, re.S)[0]
    booklist = re.findall('<dl class="booklist">(.*?)</dl>', static, re.S)
    for each in booklist:
        pm = re.findall('<dt>(.*?)</dt>', each, re.S)[0]
        if pm == '':
            continue
        st = re.findall('<dd>(.*?)</dd>', each, re.S)[0]
        try:
            st1 = re.findall('>(.*?)</a>', st, re.S)[0]
        except:
            st1 = st
        pms = test.hanzi2pinyin_split(string=pm, split="",
                                      firstcode=True).replace('/', '')
        dict[pms] = st1
        if status == 1:
            print pm,
            print st1
    return dict
예제 #3
0
파일: zcf520.py 프로젝트: Yuntong/script
def zcf(namelist):
    """
    Prompt for a keyword and find the names whose pinyin initials spell it.

    A name matches when the keyword has exactly as many characters as the
    name (after UTF-8 decoding) and, position by position, the first letter
    of each pinyin syllable equals the keyword character.

    :param namelist: sequence of UTF-8 encoded names
    :return: list of matching indexes into namelist, or None when nothing matches
    """
    converter = PinYin()
    converter.load_word()
    key = raw_input("关键词  :  ")
    matches = []
    for index, name in enumerate(namelist):
        syllables = converter.hanzi2pinyin(str(name))
        charnum = len(list(name.decode('utf-8')))
        if len(key) != charnum:
            continue
        # A list (not a generator) so every position is compared, as before.
        if all([syllables[pos][0] == key[pos] for pos in range(charnum)]):
            matches.append(index)
    if not matches:
        return None
    return matches
예제 #4
0
 def __init__(self):
     """Create the pinyin converter and load the JSON table from 'pinyin_dict'.

     Only the first line of 'pinyin_dict' is read and parsed as JSON.
     """
     self.pp = PinYin()
     self.pp.load_word()
     with open('pinyin_dict', 'r') as dict_file:
         first_line = dict_file.readline()
     # The with block has already closed the file; parsing needs only the text.
     self.jj_dict = json.loads(first_line)
예제 #5
0
def main():
    """Import the records of 'video.json' into the MySQL tables.

    For every record a row is inserted into ``resource``; the record's
    ``tag1`` is looked up in ``category`` (and created with a pinyin
    ``ename`` when absent) and linked through ``category_mapping``; each
    entry of ``screen`` is inserted into ``screenshots``.

    NOTE(review): every statement is assembled with ``str.format`` from values
    read out of the JSON file, so a quote inside a title, tag or URL breaks
    the statement (SQL injection). Whether ``MySQLHander`` accepts bound
    parameters is not visible here - confirm and switch if it does.
    """
    mysql = MySQLHander()
    p = PinYin()
    p.load_word()
    with open('video.json') as json_file:
        alldata = json.load(json_file)

    for data in alldata:
        # duration, vister and likes are random placeholder values; the name
        # is used for both the title and the description.
        sql = "INSERT INTO resource VALUES (null, '', '{title}', '{desc}', '{thumb}', '{url}',{duration}, {vister}, {likes},{creat_time}, '{up_time}')".format(title=data['name'].encode('utf-8', 'ignore'), desc=data['name'].encode('utf-8', 'ignore'), thumb=data['thumb'], url=data['url'],duration=random.randint(80, 120), vister=random.randint(4500, 9999), likes=random.randint(500, 2000), creat_time=time.time(), up_time=time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))
        rid = mysql.insert(sql)

        # xrange(1, 2) yields only 1, so just the "tag1" key is processed.
        for tag in xrange(1, 2):
            tagname = data["tag{0}".format(tag)].encode('utf-8', 'ignore')
            sql = "SELECT id from category WHERE cname='{0}'".format(tagname)
            mysql.query(sql)
            result = mysql.fetchOneRow()
            if not result:
                # Unknown category: create it, using the pinyin of the tag
                # (separators removed) as its ename.
                ename = p.hanzi2pinyin_split(string=tagname, split="-").replace('-', '')
                sql = "INSERT INTO category values(null, '{ename}', '{cname}', {time})".format(ename=ename, cname=tagname, time=int(time.time()))
                tagid = mysql.insert(sql)
                sql = "INSERT INTO category_mapping values(null, {cid}, {rid}, {time})".format(cid=tagid, rid=rid, time=int(time.time()))
                mysql.insert(sql)
            else:
                # Existing category: only link it to the new resource.
                tagid = result[0]
                sql = "INSERT INTO category_mapping values(null, {cid}, {rid}, {time})".format(cid=tagid, rid=rid, time=int(time.time()))
                mysql.insert(sql)

        for pic in data['screen']:
            sql = "INSERT INTO screenshots values(null, {rid}, '{pic}', {time})".format(rid=rid, pic=pic, time=int(time.time()))
            mysql.insert(sql)

        print "{0} done".format(data['thumb'])

    mysql.close()
예제 #6
0
def writeCityName():
    if not os.path.exists('cityName.csv'):
        url = "http://www.zxinc.org/gb2260.htm"
        print 'start reading ...'
        response = urllib.urlopen(url)
        page = response.read()
        page = page.decode('utf8')
        print 'reading done...'
        pattern = re.compile(ur'([\u4e00-\u9fa5]{2,5}市)')
        match = pattern.findall(page)
        if match:
            try:
                with open('cityName.csv', 'wb') as csvfile:
                    csvWrite = csv.writer(csvfile,
                                          delimiter=' ',
                                          quotechar='|',
                                          quoting=csv.QUOTE_MINIMAL)
                    csvfile.write(codecs.BOM_UTF8)
                    test = PinYin()
                    test.load_word()
                    for result in match:
                        result = result.encode('utf8')
                        py = test.hanzi2pinyin(string=result[:-3])
                        csvWrite.writerow([result[:-3], py[-1]])
                print 'write done!'
            except Exception as e:
                print e
            finally:
                csvfile.close()
    else:
        print 'cityName.csv detected'
예제 #7
0
def get_pinyin_data():
    """Lazily build and return the module-wide PinYin instance.

    ``gPinYin`` acts as the cache: while it is None, a converter backed by
    'pinyin_word.data' is created, stored in the global and loaded with
    ``load_word()``. Every call returns the cached object.
    """
    global gPinYin
    needs_init = gPinYin is None
    if needs_init:
        gPinYin = PinYin('pinyin_word.data')
        gPinYin.load_word()
    return gPinYin
예제 #8
0
class rhyRobot:
    #if baidu doesnot work.Try use proxy.
    def __init__(self):
        self.pinYinRobot = PinYin()
        self.pinYinRobot.load_word()
        self.shengMu = [
            "b", "p", "m", "f", "d", "t", "n", "l", "g", "k", "h", "j", "q",
            "x", "zh", "ch", "sh", "r", "z", "c", "s", "y", "w"
        ]
        self.zhengTi = [
            "zhi", "chi", "shi", "ri", "zi", "ci", "si", "yu", "ye", "yue",
            "yuan", "yin", "yun", "ying"
        ]
        print("pinYinRobot is loaded")

    def findRhyForWords(self, chinese):
        pinYinList = self.pinYinRobot.hanzi2pinyin(chinese)
        for singleWord in pinYinList:
            for zhengTi in self.zhengTi:
                if (singleWord == zhengTi):
                    print singleWord + " is whole,cant rhy"
                    return
        pinYinTuple = self.__findPinYinTuple(pinYinList)
        allPossibleWord = self.__findAllPosiblePinYin(pinYinTuple)
        print allPossibleWord

    def __getResultFromBaidu(self, allPossibleWord):
        pass

    def __getResultFromLocal(self, allPossibleWord):
        pass

    def __findAllPosiblePinYin(self, pinYinTuple):
        shengMuLen = len(self.shengMu)
        myLoopMachine = LoopMachine(len(pinYinTuple), shengMuLen)
        allPossibleWord = []
        while (myLoopMachine.shouldStop()):
            loopIndex = myLoopMachine.getLoopIndex()
            newWord = ''
            appendFlag = True
            for i in range(len(loopIndex)):
                wordToAppend = self.shengMu[loopIndex[i]] + pinYinTuple[i][1]
                if (possibleDict.has_key(wordToAppend) == False):
                    appendFlag = False
                    break
                newWord = newWord + wordToAppend + ' '
            if (appendFlag == True):
                allPossibleWord.append((newWord, 0))
            myLoopMachine.incr()
        return allPossibleWord

    def __findPinYinTuple(self, pinYinList):
        pinYinTuple = []
        for item in pinYinList:
            if (item[:2] == "zh" or item[:2] == "ch" or item[:2] == "sh"):
                pinYinTuple.append((item[:2], item[2:]))
            else:
                pinYinTuple.append((item[:1], item[1:]))
        return pinYinTuple
예제 #9
0
 def __init__(self):
     """Create the parser and pinyin converter and load the JSON table.

     The table is the first line of the 'pinyin_dict' file located next to
     this module.
     """
     self.pa = Parser()
     self.pp = PinYin()
     self.pp.load_word()
     dict_path = os.path.join(os.path.dirname(__file__), 'pinyin_dict')
     with open(dict_path, 'r') as dict_file:
         first_line = dict_file.readline()
     # The with block has already closed the file; parsing needs only the text.
     self.jj_dict = json.loads(first_line)
예제 #10
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
def t2():
    """Debug helper: print every pinyin rendering of a hard-coded mixed name.

    Steps, all on the last value assigned to ``name``:
    1. insert a space after each character for which
       ``is_hz_py(current, next)`` is false;
    2. collect the runs of CJK characters;
    3. build all pinyin combinations of the concatenated runs with
       ``Cartesian_product``;
    4. for each combination, replace every CJK run in the spaced name with
       its slice of the pinyin and print the result.

    Nothing is returned.
    """
    test = PinYin()
    test.load_word()
    #string = u"Kottlers古玩城"
    #string = u"Head 2 Toe发型店"
    #string = u"蓝"
    #print string
    #print test.hanzi2pinyin(string=string)
    #print Cartesian_product(test.hanzi2pinyin(string=string))

    # Only the last uncommented assignment takes effect.
    name = u"普季(商城)"
    name = u"Kottlers古玩城"
    name = u"hello 艾压(重庆店)山"
    name = u"库兰达(库兰达热带雨林)"
    #name = u"盛文甘hello店(店)"
    #name = u"义乌三期市场(原篁园市场)"
    print name
    p = re.compile(u'[\u4e00-\u9fa5]+')
    p_eng = re.compile(u'[a-zA-Z]+')  # unused below
    j = 0
    strs = []
    # Rebuild the name character by character, adding a space wherever
    # is_hz_py() reports false for the current/next pair.
    while (j < len(name)):

        #for j in xrange(len(name)):
        #    if j

        if j + 1 == len(name):
            # Last character: there is no successor to compare with.
            strs.append(name[j])
        else:
            print(name[j], name[j + 1]), is_hz_py(name[j], name[j + 1])
            if not is_hz_py(name[j], name[j + 1]):
                print name[j], j
                strs.append(name[j] + u" ")
            else:
                strs.append(name[j])
        j += 1
    name = "".join(strs)
    ch_names = p.findall(name)
    tmp = name  # unused below
    ll = []  # unused below
    mydict = {}  # unused below
    cnames = "".join([ch_name for ch_name in ch_names])
    #pys = test.hanzi2pinyin(string=cnames)
    pys = Cartesian_product(test.hanzi2pinyin(string=cnames))
    print cnames, pys, ch_names
    # From here on "p" is rebound from the compiled regex to a pinyin string.
    for p in pys:
        tmp2 = name
        for ch_name in ch_names:
            # Position of the run inside the concatenated CJK text; used to
            # slice the matching syllables out of the split pinyin string.
            m = re.search(ch_name, cnames)
            _start = m.start()
            _end = m.end()
            replace = " ".join([k for k in p.split()[_start:_end]])
            print _start, _end, replace, tmp2
            tmp2 = re.sub(ch_name, replace, tmp2, 1)
        print tmp2
예제 #11
0
    def get_authors_by_venue(cached_list, cached_set, cdblp_venue, dblp_venue):
        """Match the authors of a C-DBLP venue against the same venue on DBLP.

        The C-DBLP publications come from the 'cdblp-pub-cache.data' cache;
        the DBLP publications are fetched from the DBLP search API. A C-DBLP
        author counts as matched when the English form of the name is among
        the DBLP authors: first the ``full_name`` form, then - for
        three-character names only - the ``full_name_dash`` form.

        :param cached_list: unused by this function
        :param cached_set: unused by this function
        :param cdblp_venue: mapping whose 'title' selects the cached venue
        :param dblp_venue: mapping whose 'title' is used in the DBLP query
        :return: dict mapping matched English name ->
            ``{'zh': <Chinese name>, 'count': <number of matches>}``, or None
            when the C-DBLP venue is not in the cache
        """
        d = DBLPQuery.get_cache('cdblp-pub-cache.data')

        venue_title = cdblp_venue.get('title')
        if venue_title not in d:
            print('This C-DBLP venue is not on file.')
            return

        res = urlopen('http://www.dblp.org/search/api/?q=ce:venue:{}:*&h=750&format=json'.format(dblp_venue.get('title').lower()))
        try:
            payload = res.read().decode('utf-8')
        finally:
            # Release the connection even when reading or decoding fails.
            res.close()
        # fix titles as { "Title ..." }
        fixed_json = re.compile(r'({\s*)(".+")(\s*})').sub(lambda m: m.group(2), payload)

        # get publications
        cdblp_pubs = d.get(venue_title)
        dblp_pubs = json.loads(fixed_json)

        cdblp_authors = set()
        dblp_authors = set()
        authors = dict()

        # The cache is nested two levels deep above the publication lists.
        for group in cdblp_pubs.values():
            for pubs in group.values():
                for pub in pubs:
                    cdblp_authors.update(pub.get('authors'))

        for pub in dblp_pubs.get('result').get('hits').get('hit'):
            try:
                for author in pub.get('info').get('authors').get('author'):
                    dblp_authors.add(author)
            except AttributeError:
                # A hit without info/authors: report it and keep going.
                print('PublicationException: %s' % pub.get('@id'))

        pinyin = PinYin()
        pinyin.load_word()

        for author in cdblp_authors:
            name_comp = CDBLPAuthor.get_english_name(author, pinyin)
            if name_comp['full_name'] in dblp_authors:
                matched = name_comp['full_name']
            elif len(author) == 3 and name_comp['full_name_dash'] in dblp_authors:
                # The dash form must be looked up among the DBLP authors. The
                # previous code tested the result dict, which can never
                # contain the name before it is added, so this form was never
                # matched.
                matched = name_comp['full_name_dash']
            else:
                continue
            entry = authors.setdefault(matched, {'zh': name_comp['zh'], 'count': 0})
            entry['zh'] = name_comp['zh']
            entry['count'] += 1

        return authors
예제 #12
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
def t2():
    """Debug helper: print every pinyin rendering of a hard-coded mixed name.

    Steps, all on the last value assigned to ``name``:
    1. insert a space after each character for which
       ``is_hz_py(current, next)`` is false;
    2. collect the runs of CJK characters;
    3. build all pinyin combinations of the concatenated runs with
       ``Cartesian_product``;
    4. for each combination, replace every CJK run in the spaced name with
       its slice of the pinyin and print the result.

    Nothing is returned.
    """
    test = PinYin()
    test.load_word()
    #string = u"Kottlers古玩城"
    #string = u"Head 2 Toe发型店"
    #string = u"蓝"
    #print string
    #print test.hanzi2pinyin(string=string)
    #print Cartesian_product(test.hanzi2pinyin(string=string))

    # Only the last uncommented assignment takes effect.
    name = u"普季(商城)"
    name = u"Kottlers古玩城"
    name = u"hello 艾压(重庆店)山"
    name = u"库兰达(库兰达热带雨林)"
    #name = u"盛文甘hello店(店)"
    #name = u"义乌三期市场(原篁园市场)"
    print name
    p = re.compile(u'[\u4e00-\u9fa5]+')
    p_eng = re.compile(u'[a-zA-Z]+')  # unused below
    j = 0
    strs = []
    # Rebuild the name character by character, adding a space wherever
    # is_hz_py() reports false for the current/next pair.
    while (j<len(name)):
        
    #for j in xrange(len(name)):
    #    if j 
        
        if j+1 == len(name):
            # Last character: there is no successor to compare with.
            strs.append(name[j])
        else:
            print (name[j], name[j+1]), is_hz_py(name[j], name[j+1])
            if not is_hz_py(name[j], name[j+1]):
                print name[j], j
                strs.append(name[j]+u" ")
            else:
                strs.append(name[j])
        j += 1
    name  = "".join(strs)
    ch_names =  p.findall(name)
    tmp = name  # unused below
    ll = []  # unused below
    mydict = {}  # unused below
    cnames = "".join([ch_name for ch_name in ch_names])
    #pys = test.hanzi2pinyin(string=cnames)
    pys = Cartesian_product(test.hanzi2pinyin(string=cnames))
    print cnames, pys, ch_names
    # From here on "p" is rebound from the compiled regex to a pinyin string.
    for p in pys:
        tmp2 = name
        for ch_name in ch_names:
            # Position of the run inside the concatenated CJK text; used to
            # slice the matching syllables out of the split pinyin string.
            m = re.search(ch_name, cnames)
            _start = m.start()
            _end = m.end()
            replace = " ".join([k for k in p.split()[_start:_end]])
            print _start, _end, replace, tmp2
            tmp2 = re.sub(ch_name, replace, tmp2, 1)
        print tmp2
예제 #13
0
 def getciyun(self):
     """Render the three word clouds for the current topic.

     The pinyin of ``self.aa.topic`` (syllables joined by '-') prefixes each
     output name passed to ``cloud.ciyun1``.
     """
     converter = PinYin()
     converter.load_word()
     prefix = str(converter.hanzi2pinyin_split(string=str(self.aa.topic), split="-"))
     # (input file, output-name suffix): answerers, questions, locations
     jobs = (
         ('F:/zhihu/answer/people_qb.txt', 'people'),
         ('F:/zhihu/answer/question_top10.txt', 'question'),
         ('F:/zhihu/answer/p_location.txt', 'slocation'),
     )
     for source_path, suffix in jobs:
         cloud.ciyun1(source_path, prefix + suffix)
예제 #14
0
 def __init__(self):
     """Load the pinyin converter and the initial / whole-syllable tables."""
     self.pinYinRobot = PinYin()
     self.pinYinRobot.load_word()
     # Pinyin initials.
     self.shengMu = [
         "b", "p", "m", "f", "d", "t", "n", "l",
         "g", "k", "h", "j", "q", "x",
         "zh", "ch", "sh", "r", "z", "c", "s",
         "y", "w",
     ]
     # Syllables kept as indivisible wholes.
     self.zhengTi = [
         "zhi", "chi", "shi", "ri", "zi", "ci", "si",
         "yu", "ye", "yue", "yuan", "yin", "yun", "ying",
     ]
     print("pinYinRobot is loaded")
예제 #15
0
    def _generate_name(self):
        """Return the de-duplicated name-based fragments for this person.

        Fragments come from three sources: the pinyin of each character of the
        real name, the username, and the e-mail address. An empty list is
        returned when both the name and the e-mail are empty.
        """
        if not (self.name or self.email):
            return []

        converter = PinYin(PINYIN)
        converter.load_word()

        fragments = []
        # real name, converted character by character
        syllables = map(converter.hanzi2pinyin, self.name)
        fragments += self._format(syllables, built_in.name_formats)
        fragments += self._format(self.username, built_in.general_formats)
        fragments += self._generate_email()
        return list(set(fragments))
예제 #16
0
파일: person.py 프로젝트: LiarBing/genpAss
    def _generate_name(self):
        """Build the unique fragments derived from name, username and e-mail.

        Returns an empty list when neither a name nor an e-mail is set;
        otherwise the formatted pinyin of the name, the formatted username and
        the e-mail fragments are merged and de-duplicated.
        """
        has_identity = self.name or self.email
        if not has_identity:
            return []

        collected = []

        # true name
        engine = PinYin(PINYIN)
        engine.load_word()
        pinyin_per_char = map(engine.hanzi2pinyin, self.name)
        collected.extend(self._format(pinyin_per_char, built_in.name_formats))

        # username, then e-mail
        collected.extend(self._format(self.username, built_in.general_formats))
        collected.extend(self._generate_email())

        unique = set(collected)
        return list(unique)
예제 #17
0
def draw_frame(faces, img, gray, move):
    """Annotate one frame with face boxes, recognised names and the FPS.

    For every face rectangle a box is drawn on ``img``; the matching region
    of ``gray`` is resized to 200x200 and passed to the module-level
    ``model``. When the second element of the prediction is below 500.0 the
    recognised name is reported through ``change_identity`` and its pinyin is
    drawn above the box together with that value. Finally the frame counter
    ``fps`` is incremented, the average rate since ``t_start`` is drawn and
    the frame is shown.

    :param faces: iterable of (x, y, w, h) rectangles
    :param img: colour frame that is drawn on and displayed
    :param gray: grayscale frame used for recognition
        (assumed to be a NumPy image indexed [row, column] - TODO confirm)
    :param move: when equal to 2, ``steering_control(faces, img)`` runs first
    """
    global xdeg
    global ydeg
    global fps
    global time_t

    if move == 2:
        steering_control(faces, img)

    # Built lazily and at most once per call: loading the dictionary for
    # every recognised face is wasted work.
    pyin = None

    # Draw a rectangle around every face
    for (x, y, w, h) in faces:
        cv2.rectangle(img, (x, y), (x + w, y + h), (200, 255, 0), 2)
        # Rows are the vertical axis, so the crop is [y-range, x-range]. The
        # previous [x-range, y-range] slice cut a transposed, often empty,
        # region whose resize error was then silently swallowed.
        # NOTE(review): if the model was trained on crops produced with the
        # old slicing, retrain it.
        roi = gray[y:y + h, x:x + w]
        try:
            roi = cv2.resize(roi, (200, 200), interpolation=cv2.INTER_LINEAR)
            params = model.predict(roi)
            if params[1] < 500.0:
                if pyin is None:
                    pyin = PinYin()
                    pyin.load_word()
                pname = names[params[0]]
                change_identity(pname)
                pname = pyin.hanzi2pinyin_split(string=pname, split='')
                sign = "%s %.2f" % (''.join(pname), params[1])
                cv2.putText(img, sign, (x, y - 2), cv2.FONT_HERSHEY_SIMPLEX,
                            0.5, (0, 0, 255), 2)
        except Exception:
            # Best effort: a face that cannot be processed is skipped, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue

    # Calculate and show the FPS
    fps = fps + 1
    sfps = fps / (time.time() - t_start)
    cv2.putText(img, "FPS : " + str(int(sfps)), (10, 15),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    cv2.imshow("recognize-face", img)
예제 #18
0
 def __init__(self):
     """Create the pinyin converter and load the JSON table from 'pinyin_dict'.

     Only the first line of 'pinyin_dict' is read and parsed as JSON.
     """
     self.pp = PinYin()
     self.pp.load_word()
     with open('pinyin_dict', 'r') as table_file:
         serialized = table_file.readline()
     # The file is already closed here; no explicit close() is needed.
     self.jj_dict = json.loads(serialized)
예제 #19
0
파일: mall.py 프로젝트: ChenShuyan/pyb2c
def mallSearch():
    """Search malls by keyword and keep the keyword statistics up to date.

    Without a 'kw' request parameter the user is redirected to 'index'.
    Otherwise the malls whose keyword contains ``kw`` are fetched, the
    matching ``Keyword`` row is updated (or created, including its pinyin
    ``letter``), the session is committed and the result page is rendered
    together with up to ten related keywords.
    """
    kw = MyRequest.getParams('kw')
    if not kw:
        return redirect(url_for('index'))

    kw = addslashes(kw)
    malls = Mall.query.filter(Mall.keyword.like('%'+kw+'%')).all()
    keyword = Keyword.query.filter_by(keyword=kw).first()

    if not keyword:
        # First search for this keyword: create its statistics row.
        keyword = Keyword()
        keyword.keyword = kw
        keyword.items = len(malls)
        keyword.updatetime = g.siteTime
        keyword.month_search = 1
        keyword.week_search = 1
        keyword.today_search = 1
        keyword.total_search = 1
        word_data = g.rootpath+os.path.sep+'assets'+os.path.sep+'word.data'
        py = PinYin(word_data)
        py.load_word()
        keyword.letter = py.hanzi2pinyin_split(string=kw, split=" ")
        db.session.add(keyword)
    else:
        hits = len(malls)
        if hits > keyword.items:
            keyword.items = hits

        # Each counter continues while the last update falls in the same
        # period (timetodate formats 0, 8 and 3) and restarts at 1 otherwise.
        same_month = timetodate(keyword.updatetime,0) == timetodate(g.siteTime,0)
        keyword.month_search = keyword.month_search + 1 if same_month else 1

        same_week = timetodate(keyword.updatetime,8) == timetodate(g.siteTime,8)
        keyword.week_search = keyword.week_search + 1 if same_week else 1

        same_day = timetodate(keyword.updatetime,3) == timetodate(g.siteTime,3)
        keyword.today_search = keyword.today_search + 1 if same_day else 1

        keyword.total_search += 1
        keyword.updatetime = g.siteTime
    db.session.commit()

    rewords = Keyword.query.filter(Keyword.keyword.like('%'+kw+'%')).limit(10).all()

    return render_template('mall/mallSearch.html',malls=malls,g=g,kw=kw,rewords=rewords)
예제 #20
0
파일: index.py 프로젝트: binbin/phone_book
  def post(self):
    """Store a phone-book entry from the submitted form and show the success page.

    Name and department are HTML-escaped, the phone number is converted to
    int, and the pinyin of name and department is stored alongside them.
    """
    here = os.path.dirname(__file__)
    converter = PinYin(dict_file=os.path.join(here, 'libs', 'pinyin', 'word.data'))
    converter.load_word()

    form = self.request
    phone = Phone()
    phone.name = cgi.escape(form.get("name"))
    phone.phone = int(cgi.escape(form.get("phone")))
    phone.department = cgi.escape(form.get("department"))
    # The pinyin is derived from the values as stored on the entity.
    name_syllables = converter.hanzi2pinyin(string=phone.name)
    phone.name_pinyin = ''.join(name_syllables)
    department_syllables = converter.hanzi2pinyin(string=phone.department)
    phone.department_pinyin = ''.join(department_syllables)
    phone.hire_date = datetime.datetime.now().date()
    phone.put()

    success_page = os.path.join(os.path.dirname(__file__), 'templates', 'success.html')
    self.response.out.write(template.render(success_page, {}))
예제 #21
0
class changetopinyin:
    """Group word/frequency records by pinyin and dump them as JSON."""

    # pinyin key -> list of entries; defined on the class, so it is shared by
    # all instances.
    wf_dict = {}

    def __init__(self):
        """Create and load the pinyin converter."""
        self.test = PinYin()
        self.test.load_word()

    def change(self, filename):
        """Read '<word> <frequency>' lines and write 'pinyin_dict2'.

        Every line becomes a ``word_fre`` record stored under its pinyin key;
        afterwards each record is replaced in place by a ``(word, fre)`` tuple
        and the table is saved.
        """
        with open(filename, 'r') as source:
            for row in source.readlines():
                fields = row.split(' ')
                self.addtodict(word_fre(fields[0], int(fields[1])))
        for bucket in self.wf_dict.itervalues():
            for pos, entry in enumerate(bucket):
                try:
                    bucket[pos] = (entry.word, entry.fre)
                except Exception as e:
                    print(e)
        self.save('pinyin_dict2')

    def addtodict(self, wf):
        """File ``wf`` under the pinyin of its word ('_'-separated).

        A new key starts a one-element list. An existing list is treated as a
        heap: the record is pushed while the list holds at most five items,
        otherwise it is pushed and the smallest item is popped.
        """
        key = self.test.hanzi2pinyin_split(wf.word, '_')
        bucket = self.wf_dict.get(key)
        if bucket is None:
            self.wf_dict[key] = [wf]
            return
        if len(bucket) <= 5:
            heapq.heappush(bucket, wf)
        else:
            heapq.heappushpop(bucket, wf)

    def save(self, filename):
        """Write the table to ``filename`` as JSON; errors are printed."""
        with open(filename, 'w') as target:
            try:
                target.write(json.dumps(self.wf_dict))
            except Exception as e:
                print(e)
예제 #22
0
    def get_sample_users():
        cache = open('author-cache.data', 'w')
        piy = PinYin()
        piy.load_word()
        author_list = []
        res = urlopen('http://easyscholar.ruc.edu.cn/moreuser.html')
        dom = BeautifulSoup(res)
        author_tags = dom.find_all(href=re.compile('^homepage/'))
        for author_tag in author_tags:
            if author_tag.findChild('strong'):
                #print(author_tag.findChild('strong').contents)
                author_name = CDBLPAuthor.getEnglishName(author_tag.findChild('strong').contents[0])
                author_list.append(author_name)
                #print('{} {}'.format(author_name['full_name'], author_name['zh']))
                #print(CDBLPAuthor.getEnglishName(author_tag.findChild('strong').contents[0])['full_name'])
                #print(piy.hanzi2pinyin(author_tag.findChild('strong').contents[0]))

        cache.write(json.dumps(author_list))
        cache.close()
        return author_list
예제 #23
0
def idiomFind(x):
    if x == None:
        raise Exception
    else:
        with open('idiom.txt','r') as f:
            base = f.readlines()
            random.shuffle(base)
            j = 0
            for i in base:
                
                c = i[:3].decode('utf8')
                if len(i)>1:
                    try:
                        test = PinYin()
                        test.load_word()
                        py = test.hanzi2pinyin(c)[0]
                        if (py == x):
                            return i
                    except:
                        continue
        return None
예제 #24
0
class changetopinyin:
    """Bucket word/frequency records by their pinyin and export them as JSON."""

    # Class-level table (pinyin key -> entries), shared between instances.
    wf_dict = {}

    def __init__(self):
        """Prepare the pinyin converter used to compute the keys."""
        self.test = PinYin()
        self.test.load_word()

    def change(self, filename):
        """Convert the '<word> <frequency>' file and save 'pinyin_dict2'."""
        with open(filename, 'r') as infile:
            for record in infile.readlines():
                columns = record.split(' ')
                word = columns[0]
                fre = int(columns[1])
                self.addtodict(word_fre(word, fre))
        # Replace the record objects by plain (word, frequency) tuples so the
        # table can be serialised.
        for entries in self.wf_dict.itervalues():
            for index, wf in enumerate(entries):
                try:
                    entries[index] = (wf.word, wf.fre)
                except Exception as e:
                    print(e)
        self.save('pinyin_dict2')

    def addtodict(self, wf):
        """Add ``wf`` to the heap stored under the pinyin of ``wf.word``.

        Up to a length of five the record is pushed; beyond that it is pushed
        and the smallest element is removed. Unknown keys get a new list.
        """
        pinyin_key = self.test.hanzi2pinyin_split(wf.word, '_')
        if pinyin_key not in self.wf_dict:
            self.wf_dict[pinyin_key] = [wf]
            return
        heap = self.wf_dict[pinyin_key]
        insert = heapq.heappush if len(heap) <= 5 else heapq.heappushpop
        insert(heap, wf)

    def save(self, filename):
        """Dump the table as JSON into ``filename``; failures are printed."""
        with open(filename, 'w') as outfile:
            try:
                outfile.write(json.dumps(self.wf_dict))
            except Exception as e:
                print(e)
    def Convert(self):
        """Copy the contact file, adding a phonetic line after each name field.

        Every line of ``self.filename`` is kept. When a line contains an
        ``N:...;`` field, an ``X-PHONETIC-LAST-NAME:`` line follows it, built
        from the capitalised, non-empty pinyin items of that field.
        """
        py_engine = PinYin()
        py_engine.load_word()

        # Same pattern as the former r"(\N\:[^\;]*\;)" without the needless
        # escapes; "\N" is only a plain "N" on Python 2 and is rejected by
        # the Python 3 regex engine.
        name_field = re.compile(r"(N:[^;]*;)")

        contact = list()
        # The file used to be opened twice (one handle only to count lines)
        # and neither handle was closed.
        with open(self.filename, 'r') as f:
            for line in f:
                contact.append(line)
                k = name_field.findall(line)
                if k:
                    phones = py_engine.hanzi2pinyin(string=k[0])
                    phonetic = "".join(
                        [item.capitalize() for item in phones if item != ''])
                    contact.append("X-PHONETIC-LAST-NAME:" + phonetic + "\n")

        # NOTE(review): "filename" is not defined in this method; it only
        # resolves if a module-level name exists. Confirm the intended target
        # (self.filename would overwrite the input).
        with open(filename, 'w') as fout:
            for line in contact:
                fout.write(line)
예제 #26
0
파일: person.py 프로젝트: sulinx/genpAss
    def _generate_name(self):
        '''Collect password fragments from the real name, username and email.

        :return: list of unique strings; empty when username, email and name
                 are all empty
        '''
        identity = [self.username, self.email, self.name]
        if not any(identity):
            return []

        fragments = []

        # real name: pinyin of every character
        converter = PinYin(PINYIN)
        converter.load_word()
        per_char_pinyin = map(converter.hanzi2pinyin, self.name)
        fragments += self._generator(per_char_pinyin, built_in.name_formats)

        # username
        fragments += self._generator(self.username, built_in.general_formats)

        # email id string
        fragments += self._generate_email()

        return list(set(fragments))
예제 #27
0
def main(args):
    """Fill ``roominfo.py_name`` for every row where it is still NULL.

    The stored value is the full pinyin of the first syllable of the room
    name followed by the first letter of every remaining syllable. Each row
    is committed individually.
    """
    converter = PinYin()
    converter.load_word()

    conn = getconn()
    cursor = conn.cursor()
    cursor.execute('select rname,rid from roominfo where py_name is null')

    for row in cursor.fetchall():
        syllables = converter.hanzi2pinyin(string=row[0].encode("utf8"))
        # first syllable in full + initials of the remaining ones
        abbreviation = syllables[0] + ''.join([s[0] for s in syllables[1:]])
        cursor.execute('update roominfo set py_name=? where rid=?',
                       (abbreviation, row[1]))
        conn.commit()
    conn.close()
예제 #28
0
def name_tran(str):
    """Return the English-order, title-cased pinyin of a Chinese name.

    The first character is taken as the family name and the rest as the given
    name; the result is '<Given> <Family>'. The given-name part is printed
    before it is converted.
    """
    converter = PinYin()
    converter.load_word()
    family = converter.hanzi2pinyin(string=str[0])[0]
    given_part = str[1:]
    print(given_part)
    given = u''.join(converter.hanzi2pinyin(string=given_part))
    return given.title() + u' ' + family.title()
예제 #29
0
    def Convert(self):
        """Rewrite the contact file with split names and phonetic name lines.

        Each line of ``self.filename`` holding a four-part ``N:a;b;c;d;``
        field is replaced by a normalised ``N:<family>;<given>;;;`` line plus
        ``X-PHONETIC-LAST-NAME`` and ``X-PHONETIC-FIRST-NAME`` lines; every
        other line is copied unchanged. The result is written to
        ``"ok_" + self.filename``.
        """
        py_engine = PinYin()
        py_engine.load_word()

        def phonetic_line(label, text):
            # One "<label><Pinyin...>\n" line from the non-empty pinyin items.
            phones = py_engine.hanzi2pinyin(string=text)
            return label + "".join(
                [item.capitalize() for item in phones if item != '']) + "\n"

        # Same pattern as before without the needless escapes; "\N" is only a
        # plain "N" on Python 2 and is rejected by the Python 3 regex engine.
        name_field = re.compile(r"(N:[^;]*;[^;]*;[^;]*;[^;]*;)")

        contact = list()
        # The file used to be opened twice (one handle only to count lines)
        # and neither handle was closed.
        with open(self.filename, 'r') as f:
            for line in f:
                k = name_field.findall(line)
                if not k:
                    contact.append(line)
                    continue

                field = k[0]
                first_sep = field.find(';')
                second_sep = field.find(';', first_sep + 1)
                if first_sep - 2 > 3:
                    # The first component is longer than three bytes: keep
                    # its first three bytes as the family name and move the
                    # rest in front of the given name.
                    xing = field[2:5]
                    ming = field[5:first_sep] + field[first_sep + 1:second_sep]
                else:
                    xing = field[2:first_sep]
                    ming = field[first_sep + 1:second_sep]
                contact.append('N:'+xing+';'+ming+';'+";;\n")
                contact.append(phonetic_line("X-PHONETIC-LAST-NAME:", xing))
                contact.append(phonetic_line("X-PHONETIC-FIRST-NAME:", ming))

        with open("ok_"+self.filename, 'w') as fout:
            for line in contact:
                fout.write(line)
예제 #30
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-

from pinyin import PinYin

# Minimal usage demo: load the dictionary, then print the pinyin of a sample
# sentence with the syllables joined by "-".
test = PinYin()
test.load_word()
string = "钓鱼岛是中国的"
print "out: %s" % test.hanzi2pinyin_split(string=string, split="-")
예제 #31
0
from common import get_response_by_url
from mongoservice import Insert,get_category_by_cid,get_by_pinyin,get_all
from bs4 import BeautifulSoup
from pinyin import PinYin
import  os
# Category id appended to base_url when building the listing URL.
_cid = 160
base_url = "http://www.meishij.net/shiliao.php?cid="
# Module-wide pinyin converter, loaded once at import time.
s_pinyin = PinYin()
s_pinyin.load_word()

# filepath =os.path.abspath("./1.json")
'''获取 理疗分类'''
def get_meishijie_categories(cid,category_pinyin='',category_cn=''):
    """Fetch the category page for ``cid`` and select its current nav links.

    The page at ``base_url + str(cid)`` is downloaded and parsed with
    BeautifulSoup; ``dds`` holds the ``<a>`` elements matched by
    ".listnav_dl_style1 .current a".

    NOTE(review): the visible part of this function ends after the selection,
    and ``get_html_by_url`` is not among the imports shown here (only
    ``get_response_by_url`` is) - confirm both against the full file.
    """
    url=base_url+str(cid)
    html =get_html_by_url(url)
    # print(html)
    # soup = BeautifulSoup(html)
    # print(soup)
    # print(soup.prettify())

    sop = BeautifulSoup(html)
    # h = sop.prettify()
    # print( h )
    # head = sop.find('head')
    # print(head)
    # p_categories = sop.findAll(attrs={'id':'listnav_ul'})[0]
    # print(p_categories)

    # dds = sop.select(".listnav_dl_style1 dd a")
    dds = sop.select(".listnav_dl_style1 .current a")
예제 #32
0
'my puzzle of PYthon'

# Nested comprehension: every digit combined with every letter.
print( [str(a) + str(b) for a in ["1","2"] for b in ["a", "b"] ] )
# prints ['1a', '1b', '2a', '2b']

from pinyin import PinYin

# Convert a sample sentence and print what hanzi2pinyin returns.
test = PinYin()
test.load_word()
a = test.hanzi2pinyin(string='钓鱼岛是中国的')
print(a)

# Python 3 renamed the unicode type to str
unicode_or_str = "钓鱼岛是中国的"

print()
# -*- coding: utf-8 -*-
# Author: [email protected]
# Copyright 2015 @ NLPJob

#bug fixed : can be use with pyenv environment
#update: this is Python3 script
#Please modify the pinyin.py of https://github.com/cleverdeng/pinyin.py to Python3 by frederic89

import codecs
import sys

from langconv import *
from pinyin import PinYin
# Module-wide pinyin converter, loaded once at import time.
py = PinYin()
py.load_word()

def make_word_4tag(word):
    """Return the 4-tag (B/M/E/S) label string for ``word``.

    An empty word gives "N" and a single-character word gives "S". Longer
    words give "B", one "M" per inner character, then "E".
    """
    size = len(word)
    if size == 0:
        return "N"
    if size == 1:
        return "S"
    return "B" + "M" * (size - 2) + "E"

def make_mecab_seed_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
    output_data = codecs.open(output_file, 'w', 'utf-8')
예제 #34
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re
import xlwt
from pinyin import PinYin, Cartesian_product

# Module-wide pinyin converter, loaded once at import time.
test = PinYin()
test.load_word()

# Workbook that main() saves after all input files have been processed.
wbk = xlwt.Workbook()


def main():
    """Run ``func`` on each input CSV, then save the shared workbook."""
    for csv_name in ('shop_2.csv', 'sight_2.csv', 'district_2_n.csv'):
        func(csv_name)
    wbk.save("/home/chenyp/sharefolder/cn2pinyin.xls")


def func2(filename):
    #餐馆的输入文档
    poitype = filename.split(".")[0].decode('utf-8')
    column2 = u"%sid" % poitype
    count = 0
    lines = open(filename).readlines()[1:100]
    MAX = 35000
    if len(lines) % MAX == 0:
        total = len(lines) / MAX
    else:
예제 #35
0
#/usr/bin/env python
# coding=utf-8
import os
import sys
import numpy as np
import pandas as pd
import jieba
import re

sys.path.append('utils/')
import config
from pinyin import PinYin
# Pinyin converter; load_word() is not called on it in this part of the file.
str2pinyin = PinYin()
jieba.load_userdict(config.jieba_dict)
# Stop words read from the configured file, one per line ...
stopwords = [
    line.strip() for line in open(config.stopwords_path, 'r').readlines()
]
# ... decoded from UTF-8 bytes to unicode.
stopwords = [w.decode('utf8') for w in stopwords]
# stopwords=[]
#if config.cut_char_level:
# NOTE(review): with the condition above commented out, this assignment always
# replaces the list loaded from the file with three punctuation marks.
stopwords = [
    u'?',
    u'。',
    u',',
]

use_pinyin = False


def clean_str(x):
    punc = "蚂蚁  了 吗  的 !?。,:;."
예제 #36
0
 def __init__(self):
     """Create the pinyin converter and load its dictionary."""
     self.test = PinYin()
     self.test.load_word()
예제 #37
0
파일: import.py 프로젝트: simon1024/crm
//默认pwd为1q2w3e4r, md5
//默认mobile为11111111111
//默认role为普通员工, 8
//默认mail为[email protected]
//position和department进行关联

############################################################"""



import MySQLdb
import md5
from pinyin import PinYin

# init hanzi2pinyin tool
h2p = PinYin()
h2p.load_word()

# global variables
employee_file = "employees"
md5_prefix = 'psd_'
default_pwd = '1q2w3e4r'
default_mobile = '11111111111'
mobile = '11111111111'
role = 8
m = md5.new(md5_prefix + default_pwd)
m.digest()
pwd = m.hexdigest()

# config database
db_host='localhost'
예제 #38
0
    u"天气预报",
    u"京东",
    u"淘宝",
    u"百度",
    u"微信",
    u"斗鱼",
    u"爱奇艺",
    u"腾讯视频",
    u"qq",
    u"熊猫tv",
    u"快递",
    u'4399',
}


word2pinyin = PinYin()
word2pinyin.load_word()
alphabet = {'a':1, 'b':1, 'c':1, 'd':1, 'e':1, 'f':1, 'g':1,
            'h':1, 'i':1, 'j':1, 'k':1, 'l':1, 'm':1, 'n':1,
            'o':1, 'p':1, 'q':1, 'r':1, 's':1, 't':1,
            'u':1, 'v':1, 'w':1, 'x':1, 'y':1, 'z':1}


def hanzi2pinyi(word):
    """Return ``word`` converted to one lowercase pinyin string.

    Characters whose lowercase form is a key of ``alphabet`` are kept
    (lowercased). Every other character is passed to
    ``word2pinyin.hanzi2pinyin``.

    :param word: iterable of characters to convert
    :return: the per-character results concatenated into a single string
    """
    result = []
    for hanzi in word:
        lowered = hanzi.lower()
        if lowered in alphabet:
            result.append(lowered)
            continue
        converted = word2pinyin.hanzi2pinyin(hanzi)
        # hanzi2pinyin may hand back a list of syllables rather than a plain
        # string. Appending that list would make the final join raise
        # TypeError, so flatten it into the result instead.
        if isinstance(converted, (list, tuple)):
            result.extend(converted)
        else:
            result.append(converted)
    return ''.join(result)
예제 #39
0
#!/usr/bin/env python
#!-*-coding:utf8-*
#!author: pyphrb
from pinyin import PinYin
f = open('createdict.txt', 'w')
test = PinYin()
test.load_word()
def listsThree(lists):
	"""Write candidate strings built from three name parts to ``f``.

	For every line of 'dict.txt' (stripped), six variants of the name
	parts are written to the module-level file ``f``, each followed by
	that line. The five abbreviated variants are then printed once
	without any suffix.
	"""
	first, second, third = lists[0], lists[1], lists[2]
	initials = second[0:1] + third[0:1]
	combos = [
		first + second + third[0:1],
		first + initials,
		first[0:1] + initials,
		first.capitalize() + initials,
		first.capitalize()[0:1] + initials,
	]
	with open('dict.txt', 'r') as userDict:
		for line in userDict:
			suffix = line.strip() + '\n'
			f.write(first + suffix)
			for combo in combos:
				f.write(combo + suffix)
	# f is intentionally left open here; the original kept f.close() disabled.
	for combo in combos:
		print(combo)


def listsTwo(lists):
	array1 = lists[0]
	array2 = lists[1]
예제 #40
0
                host='192.100.2.31',
                user='******',
                passwd='opensesame',
                db='traincrawler',
                port=3306)
default_encoding = 'utf-8'
if sys.getdefaultencoding() != default_encoding:
    reload(sys)
    sys.setdefaultencoding(default_encoding)

base_url = "http://trains.ctrip.com/TrainBooking/Ajax/GetTrainDataV2.aspx?DepartureCity=%s&ArrivalCity=%s&DepartureDate=2017-03-30&NO=01"
# post_param = 'http://trains.ctrip.com/TrainBooking/Ajax/SearchListHandler.ashx?Action=getSearchList&value={"IsBus": False, "Filter": "0", "Catalog": "", "IsGaoTie":False, "IsDongChe":False, "CatalogName": "", "DepartureCity": %s, "ArrivalCity": %s, "HubCity": "", "DepartureCityName": %s, "ArrivalCityName": %s, "DepartureDate": "2017-03-24", "DepartureDateReturn": "2017-03-26", "ArrivalDate": "", "TrainNumber": ""}'
base_path = 'xc-price/%s'
getStations_sql = 'select id,begin_stop,begin_alia,end_stop,end_alia from train_stop_20170331_task_xc where task=0 limit 100'
update_sql = 'update train_stop_20170331_task_xc set task = 1 where id =%s'
py_util = PinYin()
py_util.load_word('word.data')


def get(p):
    time.sleep(1)
    content = ''
    try:
        p = p.encode('utf-8')
        response = urllib.urlopen(p)
        content = response.read()
        response.close()
        return content.decode('gb2312')
    except Exception as e:
        print e
        content = '500'
예제 #41
0
# -*- coding: utf-8 -*-
# from models import Cnword
from pinyin import PinYin
test = PinYin()
test.load_word()
print test.hanzi2pinyin(string='钓鱼岛是中国的')
print test.hanzi2pinyin_split(string='钓鱼岛是中国的')
예제 #42
0
파일: models.py 프로젝트: liutang123/odoo
# -*- coding: utf-8 -*-
import re

from openerp import models, fields, api
from openerp.osv.expression import get_unaccent_wrapper, NEGATIVE_TERM_OPERATORS

from pinyin import PinYin
han2py = PinYin()
han2py.load_word()
# class multiple_name_search(models.Model):
#     _name = 'multiple_name_search.multiple_name_search'

#     name = fields.Char()

# Shared helper that converts a name to pinyin
def comman_change_name(name):
    """Build the pinyin forms of ``name``.

    :param name: text to convert; a falsy value skips the conversion
    :return: dict with 'pinyin' (all items returned by
        ``han2py.str2pinyin`` joined) and 'py' (the first character of each
        item joined); both values are False when ``name`` is falsy
    """
    if not name:
        return {'pinyin': False, 'py': False}

    pieces = han2py.str2pinyin(name)
    print(pieces)
    initials = [piece[0] for piece in pieces]
    return {'pinyin': ''.join(pieces), 'py': ''.join(initials)}

class WithPinyinProductTemplate(models.Model):
    _inherit = 'product.template'
    
    pinyin = fields.Char(string='拼音', help='拼音英语表示,如“名称”的拼音是“mingcheng”') # , default=lambda self: comman_change_name(self.name)['pinyin']
    py = fields.Char(string='拼音首字母', index=True, help='拼音英语表示首字母,如“名称”的拼音是“mingcheng”,则它的拼音首字母则为“mc”') # , default=lambda self: comman_change_name(self.name)['py']
    
예제 #43
0
파일: models.py 프로젝트: liutang123/odoo
# -*- coding: utf-8 -*-
import re

from openerp import models, fields, api
from openerp.osv.expression import get_unaccent_wrapper, NEGATIVE_TERM_OPERATORS

from pinyin import PinYin
han2py = PinYin()
han2py.load_word()

# class multiple_name_search(models.Model):
#     _name = 'multiple_name_search.multiple_name_search'

#     name = fields.Char()


# Shared helper that converts a name to pinyin
def comman_change_name(name):
    """Return the full-pinyin and initials strings for ``name``.

    The result is ``{'pinyin': ..., 'py': ...}``. Both entries are False
    when ``name`` is falsy. Otherwise 'pinyin' joins every item returned by
    ``han2py.str2pinyin(name)`` and 'py' joins the first character of each.
    """
    result = {'pinyin': False, 'py': False}
    if name:
        parts = han2py.str2pinyin(name)
        print(parts)
        first_letters = ''.join([part[0] for part in parts])
        result['pinyin'] = ''.join(parts)
        result['py'] = first_letters
    return result


class WithPinyinProductTemplate(models.Model):
    _inherit = 'product.template'
예제 #44
0
class CDBLPAuthor:

    pinyin = PinYin()
    pinyin.load_word()

    def __init__(self, author_name, link=''):
        """Fetch and parse the page for ``author_name``.

        :param author_name: author name passed to ``getEnglishName``
        :param link: page URL to load; when empty, a cdblp.cn search URL is
            built from ``self.author_name['zh']``
        """

        self.author_name = CDBLPAuthor.getEnglishName(author_name)

        if not link:
            link = 'http://cdblp.cn/search_result.php?author_name={}&area=computer'.format(
                quote(self.author_name['zh']))
        # Only reached when a link WAS supplied: for this one name the link
        # is replaced by a localhost URL.
        # NOTE(review): looks like a leftover debugging hook -- confirm.
        elif author_name == '王伟':
            link = 'http://127.0.0.1/ww'

        self.res = urlopen(link)
        self.dom = BeautifulSoup(self.res)

        #self.get_all_authors()

        # Initial value of self.author: one hard-coded sample publication.
        # get_author() replaces the whole dict.
        self.author = {
            'author_name': {},
            'coauthors': [],
            'publications': [{
                'title':
                'Ranking the Difficulty Level of the Knowledge Units Based on Learning Dependency',
                'authors':
                ['Jun Liu', 'Sha Sha', 'Qinghua Zheng', 'Wei Zhang'],
                'venue-type':
                'journal',
                'venue':
                'IJDET',
                'volume':
                '',
                'number':
                '',
                'pages':
                '',
                'year':
                '2012',
                'cdblpkey':
                '83594'
            }]
        }

    def get_all_authors(self):
        """Let the user pick one of the same-named authors and load that page.

        Collects every link on the current page whose href contains
        'namedisambiguation' and whose text is not 'Unknown', prints them as
        a numbered menu, and reads the choice from stdin. ``self.res`` and
        ``self.dom`` are then replaced with the chosen page.

        :return: URL of the chosen author page
        :raises ValueError: if the typed choice is not an integer
        :raises IndexError: if the choice is past the last listed entry
        """
        candidate_links = []

        all_name_tags = self.dom.find_all(
            href=re.compile('namedisambiguation'))

        for name_tag in all_name_tags:
            if name_tag.string != 'Unknown':
                # Number each entry by its position in candidate_links, so
                # the number the user types selects the link that was shown
                # even when 'Unknown' entries were skipped before it.
                print(len(candidate_links), self.author_name['zh'], 'from',
                      name_tag.string)
                candidate_links.append(
                    'http://cdblp.cn' + name_tag['href'][5:])

        c = int(
            input(
                'There are several authors under this name, which one do you want to choose?\n> '
            ))
        # Negative input falls back to the first entry.
        if c < 0:
            c = 0

        self.res = urlopen(candidate_links[c])
        self.dom = BeautifulSoup(self.res)

        return candidate_links[c]

    def get_author(self):
        """Scrape the loaded author page into the ``self.author`` record.

        Every link whose href starts with ``/paper`` is treated as one
        publication. Its authors, venue, volume, issue, pages and year are
        read from the parent element of that link.

        :return: dict with keys 'author_name', 'coauthors' and
            'publications' (the same object is stored on ``self.author``)
        :raises IndexError: if a publication cell has fewer ``/journal``
            links than expected or a href does not match the expected shape
        """
        coauthors = self.get_coauthors()
        publications = []

        # Raw strings keep the regex escapes (\d, \.) from being parsed as
        # invalid Python string escapes. Compiled once, outside the loop.
        key_pattern = re.compile(r'(\d+)(\.html$)')
        volume_pattern = re.compile(r'(/journal)/(.*)/(\d*)/(.*)')
        issue_pattern = re.compile(r'(/journal_issue)/(.*)/(\d*)/(.*)')

        paper_link_tags = self.dom.find_all(href=re.compile('^/paper'))

        for paper_link_tag in paper_link_tags:
            # The parent of the paper link also holds its metadata links.
            td_tag = paper_link_tag.parent
            title = paper_link_tag.string
            link = paper_link_tag['href']
            cdblbkey = key_pattern.findall(link)[0][0]

            authors = []
            author_tags = td_tag.find_all(href=re.compile('^/author'))
            for counter, author_tag in enumerate(author_tags):
                # Exact type checks are kept on purpose: isinstance would
                # also accept NavigableString subclasses such as Comment.
                if counter == 0:
                    # Presumably the page owner's own name appears as plain
                    # text rather than a link; look before the first link.
                    current_author = author_tag.previous_sibling
                    if type(current_author) == NavigableString and \
                            self.author_name['zh'] in current_author.string:
                        authors.append(current_author.string.strip())

                if isinstance(author_tag.string, str):
                    authors.append(author_tag.string.strip())

                # ...and in the text that follows each author link.
                current_author = author_tag.next_sibling
                if type(current_author) == NavigableString and \
                        self.author_name['zh'] in current_author.string:
                    authors.append(
                        current_author.string.replace('.', '').strip())

            # Publication data: venue, volume and issue links, in that order.
            venue_rec = td_tag.find_all(href=re.compile('^/journal'))
            venue = venue_rec[0].string
            volume_result = volume_pattern.findall(venue_rec[1]['href'])[0]
            issue_result = issue_pattern.findall(venue_rec[2]['href'])[0]
            year = volume_result[-2]
            volume = volume_result[-1]
            number = issue_result[-1]
            pages = venue_rec[-1].next_sibling.string.replace(':', '').strip()

            publications.append({
                'title': title,
                'authors': authors,
                'venue-type': 'journal',
                'venue': venue,
                'volume': unquote(volume),
                'number': unquote(number),
                'pages': pages,
                'year': year,
                'cdblpkey': cdblbkey
            })

        self.author = {
            'author_name': self.author_name,
            'coauthors': coauthors,
            'publications': publications
        }

        return self.author

    def get_coauthors(self):
        """Collect the co-authors listed in the second-to-last table.

        :return: list of name dicts from ``getEnglishName``, each extended
            with 'count' (number of links in the next ``td`` sibling) and
            'pubs' (list of those links' hrefs without the leading character)
        """
        coauthors = []

        coauthor_table = self.dom.find_all('table')[-2]
        coauthor_tags = coauthor_table.find_all(href=re.compile('^/author'))
        for coauthor_tag in coauthor_tags:
            coauthored_pub_tags = coauthor_tag.parent.find_next_sibling(
                'td').find_all('a')
            author = CDBLPAuthor.getEnglishName(coauthor_tag.string.strip())
            author['count'] = len(coauthored_pub_tags)
            # Build a real list: on Python 3 map() returns a one-shot
            # iterator, which is empty after the first pass and cannot be
            # serialised by json.
            author['pubs'] = [t['href'][1:] for t in coauthored_pub_tags]
            coauthors.append(author)

        return coauthors

    @staticmethod
    def getEnglishName(author_name_zh):
        """Return the romanised name record for ``author_name_zh``.

        Same contract as ``get_english_name``, using the class-level
        ``CDBLPAuthor.pinyin`` converter.

        :param author_name_zh: author name to convert
        :return: ``{'full_name': ...}`` when the converter returns a string,
            otherwise a dict with 'zh', 'last_name', 'first_name',
            'full_name', 'full_name_reverse' and, for three-character
            names, 'full_name_dash'
        """
        return CDBLPAuthor.get_english_name(author_name_zh,
                                            CDBLPAuthor.pinyin)

    @staticmethod
    def get_english_name(author_name_zh, py_obj):
        """Return the romanised name record for ``author_name_zh``.

        :param author_name_zh: author name to convert
        :param py_obj: object whose ``hanzi2pinyin`` does the conversion
        :return: ``{'full_name': ...}`` when ``hanzi2pinyin`` returns a
            string. Otherwise a dict with 'zh', 'last_name' (first item,
            capitalised), 'first_name' (remaining items, first one
            capitalised), 'full_name', 'full_name_reverse' and, when the
            input has exactly three characters, 'full_name_dash'
        """
        parts = py_obj.hanzi2pinyin(author_name_zh.strip())

        # A plain string result is passed through unchanged.
        if isinstance(parts, str):
            return {'full_name': parts}

        last_name = parts[0].capitalize()
        if len(author_name_zh) > 1:
            first_name = parts[1].capitalize() + ''.join(parts[2:])
        else:
            first_name = ''

        author_name = {
            'zh': author_name_zh,
            'last_name': last_name,
            'first_name': first_name
        }
        author_name['full_name'] = '{} {}'.format(first_name, last_name)
        author_name['full_name_reverse'] = '{} {}'.format(
            last_name, first_name)
        if len(author_name_zh) == 3:
            author_name['full_name_dash'] = '{}-{} {}'.format(
                parts[1].capitalize(), parts[2], last_name)

        return author_name

    @staticmethod
    def get_publications_by_journal(journal, year, issue):
        """Scrape the publications listed on one journal issue page.

        :param journal: journal segment of the cdblp.cn issue URL
        :param year: year segment of the issue URL
        :param issue: issue segment of the issue URL
        :return: list of dicts with keys 'title', 'authors', 'venue-type',
            'venue', 'volume', 'number', 'pages', 'year' and 'cdblpkey'
        :raises IndexError: if a publication cell has fewer ``/journal``
            links than expected or a href does not match the expected shape
        """
        res = urlopen('http://cdblp.cn/journal_issue/' +
                      quote('{}/{}/{}'.format(journal, year, issue)))
        dom = BeautifulSoup(res)
        publications = []

        # Raw strings keep the regex escapes (\d, \.) from being parsed as
        # invalid Python string escapes. Compiled once, outside the loop.
        key_pattern = re.compile(r'(\d+)(\.html$)')
        volume_pattern = re.compile(r'(/journal)/(.*)/(\d*)/(.*)')
        issue_pattern = re.compile(r'(/journal_issue)/(.*)/(\d*)/(.*)')

        for paper_link_tag in dom.find_all(href=re.compile('^/paper')):
            # The parent of the paper link also holds its metadata links.
            td_tag = paper_link_tag.parent
            title = paper_link_tag.string
            link = paper_link_tag['href']
            cdblbkey = key_pattern.findall(link)[0][0]

            authors = []
            for author_tag in td_tag.find_all(href=re.compile('^/author')):
                author_name = author_tag.contents[0]
                if isinstance(author_name, str):
                    authors.append(author_name.strip())

            # Publication data: venue, volume and issue links, in that order.
            venue_rec = td_tag.find_all(href=re.compile('^/journal'))
            venue = venue_rec[0].string
            volume_result = volume_pattern.findall(venue_rec[1]['href'])[0]
            issue_result = issue_pattern.findall(venue_rec[2]['href'])[0]
            # Rebinds the ``year`` argument with the year parsed from the
            # page. The request URL was already built above.
            year = volume_result[-2]
            volume = volume_result[-1]
            number = issue_result[-1]
            pages = venue_rec[-1].next_sibling.string.replace(':', '').strip()

            publications.append({
                'title': title,
                'authors': authors,
                'venue-type': 'journal',
                'venue': venue,
                'volume': unquote(volume),
                'number': unquote(number),
                'pages': pages,
                'year': year,
                'cdblpkey': cdblbkey
            })

        return publications

    @staticmethod
    def get_publication_dict():
        """Crawl every journal listed on the cdblp.cn journal-scan page.

        Progress (journal names, URLs, issue labels) is printed as the crawl
        proceeds.

        :return: nested dict ``{journal: {year: {issue: publications}}}``
            where ``publications`` comes from ``get_publications_by_journal``
        """
        publication_dict = {}

        res = urlopen('http://cdblp.cn/jour_scan.php?fid=journalscan')
        category_dom = BeautifulSoup(res)

        # Query the page once and reuse the result for the listing and loop.
        journal_tags = category_dom.find_all(href=re.compile('^/journal'))

        print([{
            'title': c.string,
            'href': 'http://cdblp.cn' + c['href']
        } for c in journal_tags])

        # Raw string keeps \d from being parsed as an invalid string escape.
        issue_pattern = re.compile(r'(/journal_issue)/(.*)/(\d*)/(.*)')

        for journal_tag in journal_tags:

            journal = journal_tag.string
            print(journal)
            print('http://cdblp.cn' + journal_tag['href'])
            publication_dict[journal] = {}

            res = urlopen('http://cdblp.cn' + journal_tag['href'])
            journal_dom = BeautifulSoup(res)

            for issue_tag in journal_dom.find_all(
                    href=re.compile('^/journal_issue')):
                print(issue_tag.string)
                print(issue_tag['href'])

                issue_result = issue_pattern.findall(issue_tag['href'])[0]
                year = issue_result[-2]
                issue = unquote(issue_result[-1])
                publications = CDBLPAuthor.get_publications_by_journal(
                    journal, year, issue)

                # Create the per-year dict on first use, then file the issue.
                publication_dict[journal].setdefault(
                    year, {})[issue] = publications

        return publication_dict

    @staticmethod
    def parallel_get(journal, link):
        """Crawl one journal page and cache its publications as JSON.

        Issues that raise AttributeError, TypeError or HTTPError are
        reported on stdout and skipped. The result is written to
        '<journal>-pub-cache.data' in the current directory.

        :param journal: journal name, used in messages and the cache filename
        :param link: URL of the journal page listing its issues
        :return: nested dict ``{year: {issue: publications}}``
        """
        publication_dict = {}

        print(journal)
        print(link)

        res = urlopen(link)
        journal_dom = BeautifulSoup(res)

        # Raw string keeps \d from being parsed as an invalid string escape.
        issue_pattern = re.compile(r'(/journal_issue)/(.*)/(\d*)/(.*)')

        for issue_tag in journal_dom.find_all(
                href=re.compile('^/journal_issue')):
            # Reset per issue so the error report below never hits an
            # unbound name or shows the previous issue's values.
            year = issue = ''
            try:
                issue_result = issue_pattern.findall(issue_tag['href'])[0]
                year = issue_result[-2]
                issue = unquote(issue_result[-1])
                publications = CDBLPAuthor.get_publications_by_journal(
                    journal, year, issue)

                publication_dict.setdefault(year, {})[issue] = publications
            except (AttributeError, TypeError,
                    urllib.error.HTTPError) as err:
                # Best effort: report the failing issue and carry on.
                print(journal + year + issue)
                print(err)

        # The context manager closes the cache file even if the write fails.
        with open('{}-pub-cache.data'.format(journal), 'w') as cache:
            cache.write(json.dumps(publication_dict))

        return publication_dict
예제 #45
0
# -*- coding: utf-8 -*-
# Author: [email protected]
# Copyright 2015 @ NLPJob

#bug fixed: can be used with a pyenv environment
#update: this is Python3 script

import codecs
import sys

from langconv import *
from pinyin import PinYin
py = PinYin()
py.load_word()


def make_word_4tag(word):
    """Return the 4-tag label string for ``word``.

    "N" for an empty word, "S" for a single character, otherwise "B"
    followed by one "M" per interior character and a closing "E".
    """
    size = len(word)
    if size < 2:
        return "S" if size else "N"
    middle = ["M" for _ in word[1:-1]]
    return "".join(["B"] + middle + ["E"])


def make_mecab_train_data(input_file, output_file):
    input_data = codecs.open(input_file, 'r', 'utf-8')
예제 #46
0
def check_contain_english(check_str):
    """Return True if ``check_str`` has any character outside U+4E00..U+9FFF.

    :param check_str: UTF-8 encoded bytes, or already-decoded text
    :return: True when at least one character lies outside that range,
        False otherwise (including for empty input)
    """
    if isinstance(check_str, bytes):
        check_str = check_str.decode('utf-8')
    # The range is inclusive at both ends: U+4E00 is itself a hanzi and
    # must not be reported as a non-Chinese character.
    return any(ch < u'\u4e00' or ch > u'\u9fff' for ch in check_str)


# import city name into pandas Dataframe
with open('ChinaCityList.json') as json_data:
    d = json.load(json_data)
# extract city name into list
city = json_normalize(data=d, record_path=['city', 'county'])
city_name = city.name.tolist()
city_name = [x.encode('utf-8') for x in city_name]
# Build a dictionary in the form of city:pinyin
trans = PinYin()
trans.load_word()
to_py = trans.hanzi2pinyin
city_py = [to_py(x) for x in city_name]
city_dict = dict(zip(city_name, city_py))


# city chain game
def city_chain(city):
    if len(city) == 0:
        print '错误:请确认是否输入汉字'
    elif check_contain_english(city):
        print '错误:请确认是否输入了非汉字'
    else:
        candidate = []
        py_city = to_py(city)
예제 #47
0
class correction:
    
    def __init__(self):
        """Set up the pinyin converter and load the lookup table.

        ``self.jj_dict`` is parsed as JSON from the first line of the
        'pinyin_dict' file in the current directory.
        """
        self.pp = PinYin()
        self.pp.load_word()
        with open('pinyin_dict', 'r') as dict_file:
            first_line = dict_file.readline()
        self.jj_dict = json.loads(first_line)


    def correct(self,phrase_list):
        """Return alternative spellings for the phrase in ``phrase_list``.

        Every regrouping from ``recompose`` is expanded into all combinations
        of per-item candidates. Each combination is scored, and the best
        ones (as chosen by ``sscore``) are returned.

        :param phrase_list: list of strings that together form the phrase
        :return: list of candidate strings; empty when the top candidate
            equals the concatenated input, otherwise the candidates with the
            concatenated input removed
        """
        termlist=[]
        # never read below
        flag=False

        newplist=self.recompose(phrase_list)
        '''for item in newplist:
            for item2 in item:
                print(item2.encode('utf-8'))'''
        for nnlist in newplist:
            i=0
            # tmp_correct[k]: candidates for the k-th item, each indexed as
            # [0] text and [1] weight (presumably (term, weight) pairs --
            # confirm against the pinyin_dict file).
            tmp_correct=[]
            # correct_num[k]: index of the candidate currently chosen for k
            correct_num=[]
            for item in nnlist:
                py=self.pp.hanzi2pinyin_split(item,'_')
                tmp=[]
                tmp_correct.append(tmp)
                if(py in self.jj_dict):
                    for item2 in self.jj_dict[py]:
                        tmp_correct[i].append(item2)
                else:
                    # no entry for this key: keep the item, weight 1
                    tmp_correct[i].append((item,1))
                correct_num.append(0)
                i+=1
            
            length=len(tmp_correct)
            notend=True
            # Enumerate every combination of candidates. correct_num acts as
            # a mixed-radix counter with position 0 changing fastest.
            while notend:
                i=0
                tmpstr=''
                score=0
                for j in xrange(0,length):
                    tmps=tmp_correct[j][correct_num[j]][0]
                    tmpstr+=tmps
                    # weight times length**7 favours longer candidates
                    score+=int(tmp_correct[j][correct_num[j]][1])*len(tmps)**7
                termlist.append(tup(tmpstr,score))
                correct_num[0]+=1
                # carry overflowing positions; overflow of the last position
                # ends the enumeration
                while correct_num[i]>=len(tmp_correct[i]):
                    correct_num[i]=0
                    if i<length-1:
                        correct_num[i+1]+=1
                        i+=1
                    else:
                        notend=False
        
        result_list=self.sscore(termlist)
        comstr=''
        for item in phrase_list:
            comstr+=item
        # If the best candidate is the input itself there is nothing to
        # correct; otherwise drop the input from the suggestions.
        if result_list[0]==comstr:
            result_list=[]
        else:
            if comstr in result_list:
                result_list.pop(result_list.index(comstr))
        return result_list


    def sscore(self, termlist):
        """Return the ``term`` of the (at most) five largest items.

        Items are ordered by their own comparison and the result lists the
        largest first. ``termlist`` itself is not modified.
        """
        pending = list(termlist)
        heapq.heapify(pending)
        # Discard the smallest items until no more than five remain.
        for _ in range(max(len(pending) - 5, 0)):
            heapq.heappop(pending)
        # Pop the survivors in ascending order, then flip to descending.
        ascending = [heapq.heappop(pending).term for _ in range(len(pending))]
        return ascending[::-1]

    def recompose(self,phrase_list):
        """Return the distinct regroupings of ``phrase_list``.

        Each single-character item gets a state in {0, 1, 2}: 0 merges it
        into the previous item, 1 merges it into the next item, and 2 leaves
        it alone. Every combination of states is tried and each new list is
        collected once.

        :param phrase_list: list of strings
        :return: list of lists of strings; when there are no
            single-character items it holds ``phrase_list`` itself (not a copy)
        """
        position=[]
        attach={}
        cpl=[]    # result: list of regrouped lists
        i=0       # index of the current item in phrase_list
        for item in phrase_list:
            if(len(item)==1):
                position.append(i)
                attach[i]=0
            i+=1

        notend=True
        length=len(position)
        if length>0:
            while notend:
                # gap: number of items already removed from tmp_list, used
                # to map original indices onto the shrinking list
                gap=0
                tmp_list=copy.deepcopy(phrase_list)
                tmp_position=copy.deepcopy(position)
                pi=0
                while pi < len(tmp_position):
                    item2=tmp_position[pi]
                    if(attach[item2]==0):
                        # state 0: append to the previous item, if any
                        if item2-1-gap>=0:
                            tmp_list[item2-1-gap]+=tmp_list[item2-gap]
                            k=tmp_position.index(item2)
                            tmp_position.pop(k)
                            tmp_list.pop(item2-gap)
                            gap+=1
                        else:
                            pi+=1
                            '''while k < len(tmp_position):
                                tmp_position[k]-=gap
#print(tmp_position[k])
                                attach[tmp_position[k]]=attach[tmp_position[k]+gap]
                                k+=1'''
                    else:
                        if attach[item2]==1:
                            # state 1: prepend to the next item, if any
                            if item2+1-gap<len(tmp_list):
                                tmp_list[item2+1-gap]=tmp_list[item2-gap]+tmp_list[item2+1-gap]
                                k=tmp_position.index(item2)
                                tmp_position.pop(k)
                                tmp_list.pop(item2-gap)
                                gap+=1
                                # the next item is no longer a single
                                # character, so stop tracking it
                                if item2+1 in tmp_position:
                                    tmp_position.pop(tmp_position.index(item2+1))
                            else:
                                pi+=1
                        else:
                             # state 2: leave the item where it is
                             pi+=1
                        '''while k < len(tmp_position):
                            tmp_position[k]-=gap
                            attach[tmp_position[k]]=attach[tmp_position[k]+gap]
                            k+=1'''
                '''flag=True
                for item3 in tmp_list:
                    if len(item3)==1:
                        flag=False
                if flag:'''
                if tmp_list not in cpl:
                    cpl.append(tmp_list)

                attach[position[0]]+=1 # advance one state per pass
                i=0
                # base-3 carry across the tracked positions; overflow of the
                # last one ends the enumeration
                while attach[position[i]]>=3:
                        attach[position[i]]=0
                        if i<length-1:
                            attach[position[i+1]]+=1
                            i+=1
                        else:
                            notend=False
        else:
            cpl.append(phrase_list)
        return cpl
예제 #48
0
파일: get_phonetic.py 프로젝트: AmoCat/smp
def get_phonetic(word):
    """Return the items of ``PinYin.hanzipinyin(word)`` joined into a string.

    A new ``PinYin`` instance is created and ``load_word()`` is called on
    every invocation.
    """
    converter = PinYin()
    converter.load_word()
    # NOTE(review): confirm PinYin really exposes `hanzipinyin`; the commonly
    # used method name is `hanzi2pinyin`.
    pieces = converter.hanzipinyin(word)
    return ''.join(pieces)
예제 #49
0
#!/usr/bin/env python
#!-*-coding:utf8-*
#!author: pyphrb
from pinyin import PinYin
f = open('createdict.txt', 'w')
test = PinYin()
test.load_word()


def listsThree(lists):
    """Write candidate strings built from three name parts to ``f``.

    For each stripped line of 'dict.txt', the first part alone and five
    abbreviated combinations of the three parts are written to the
    module-level file ``f``, each followed by that line. The five
    combinations are then printed once on their own.
    """
    array1, array2, array3 = lists[0], lists[1], lists[2]
    tail = array2[0:1] + array3[0:1]
    prefixes = (
        array1 + array2 + array3[0:1],
        array1 + tail,
        array1[0:1] + tail,
        array1.capitalize() + tail,
        array1.capitalize()[0:1] + tail,
    )
    with open('dict.txt', 'r') as userDict:
        for entry in userDict:
            word = entry.strip()
            f.writelines(p + word + '\n' for p in (array1,) + prefixes)
    # f is intentionally left open here; the original kept f.close() disabled.
    for prefix in prefixes:
        print(prefix)
예제 #50
0
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from pinyin import PinYin
import sys

if __name__ == "__main__":
  # Reads one phrase per line from the file named by sys.argv[1] and writes
  # two files under dict/: "short_<name>" maps the first letter of each
  # pinyin item to the phrase, "full_<name>" maps the joined pinyin to it.
  test = PinYin()
  test.load_word()
  # string = "测试文本"
  base_name = sys.argv[1].split("/")[-1]
  # Context managers close all three files even when a conversion fails.
  # Previously the input file was never closed.
  with open(sys.argv[1], 'r') as f, \
       open("dict/" + "short_" + base_name, 'w') as f2, \
       open("dict/" + "full_" + base_name, 'w') as f3:
    for i in f:
      # ch_str = i.split(" ")[1].strip()
      ch_str = i.strip()
      print("\"" + ch_str + "\"")
      short_arr = test.hanzi2pinyin(string=ch_str)
      short_str = ""
      for x in short_arr:
        try:
          print(x)
          short_str += x[0]
        except Exception:
          # Best effort: skip items that cannot be printed or are empty,
          # but let KeyboardInterrupt and SystemExit through.
          continue

      print(short_str)
      f2.write(short_str + " " + ch_str + "\n")
      f3.write("".join(short_arr) + " " + ch_str + "\n")
예제 #51
0
 def __init__(self):
     """Create a ``PinYin`` instance on ``self.test`` and call its ``load_word()``."""
     self.test=PinYin()
     self.test.load_word()
예제 #52
0
파일: cn2pinyin.py 프로젝트: ddmkchan/Utils
#!/usr/bin/env python
# -*- coding:utf-8 -*-

import re
import xlwt
from pinyin import PinYin, Cartesian_product

test = PinYin()
test.load_word()

wbk = xlwt.Workbook()

def main():
    """Pass each input CSV name to ``func``, then save the workbook."""
    # An earlier revision processed only 'shop_2.csv' and 'sight_2.csv'.
    csv_names = ('shop_2.csv', 'sight_2.csv', 'district_2_n.csv')
    for csv_name in csv_names:
        func(csv_name)
    wbk.save("/home/chenyp/sharefolder/cn2pinyin.xls")

def func2(filename):
    #餐馆的输入文档
    poitype = filename.split(".")[0].decode('utf-8')
    column2 = u"%sid" % poitype
    count = 0
    lines = open(filename).readlines()[1:100]
    MAX = 35000
    if len(lines) % MAX == 0:
        total = len(lines) / MAX
    else:
        total = int(len(lines) / 35000.0 + 1)
    sheets = [wbk.add_sheet("%s_%s" % (poitype, i)) for i in xrange(total)]