def article_cat_save():
    """Save edits to an existing article category posted from the admin form.

    Field values are read from ``request.form``. The directory slug is the
    pinyin of the category name when the "check" box is ticked or when no
    directory was typed in; otherwise the submitted directory is kept.

    Returns:
        A redirect to ``admin.article_cat_list`` once the row is updated.
        When the form does not validate, the function falls through without
        returning a response.
    """
    form = Article_cat(request.form)
    converter = Pinyin()
    if form.validate():
        posted = request.form
        cat_id = posted.get('cat_id')
        parent_id = posted.get('parent_id')
        cat_name = posted.get('cat_name')
        directory = posted.get('dir')
        # Generate the slug from the name unless a custom directory was
        # supplied and automatic generation was not requested.
        if posted.get('check') or not directory:
            directory = converter.get_pinyin(cat_name, '')
        changes = {
            Articles_Cat.parent_id: parent_id,
            Articles_Cat.cat_name: cat_name,
            Articles_Cat.dir: directory,
            Articles_Cat.keywords: posted.get('keywords'),
            Articles_Cat.description: posted.get('description'),
            Articles_Cat.cat_sort: posted.get('cat_sort'),
            Articles_Cat.status: posted.get('status'),
        }
        Articles_Cat.query.filter(Articles_Cat.cat_id == cat_id).update(changes)
        db.session.commit()
        return redirect(url_for('admin.article_cat_list'))
def generate_hostname(self):
    """Build the next free hostname for the instance with this serial number.

    The instance name is split on ``-``; its first part is converted to
    pinyin and joined with the second and third parts to form a prefix. The
    highest existing ``<prefix>NNN.meetyima.com`` hostname is looked up and
    its three-digit sequence number incremented, starting from ``001``.

    Returns:
        tuple: ``(hostname, innerIP, pubIP, status, instanceid)``.

    Raises:
        IndexError: if no instance matches, an address list is empty, or the
            instance name has fewer than three ``-``-separated parts.
        ValueError: if the instance name is empty.
    """
    pinyin = Pinyin()
    # Fetch the row once; indexing the queryset again for every field would
    # repeat the lookup each time.
    instance = Instance.objects.filter(SerialNumber=self.serialnumber)[0]
    innerIP = instance.InnerIpAddress[0]
    pubIP = instance.PublicIpAddress[0]
    status = instance.Status
    instanceid = instance.InstanceId
    if not instance.InstanceName:
        raise ValueError("instance has no InstanceName")

    info = instance.InstanceName.split("-")
    prefix = pinyin.get_pinyin(info[0], '') + '-' + info[1] + '-' + info[2] + '-'
    # Dots are escaped so that only the literal domain matches.
    hostname_last = Server.objects.filter(
        hostname__regex=r'^%s[0-9]{3}\.meetyima\.com$' % prefix
    ).order_by('hostname').last()
    # If a host with the same prefix exists, increment its id; otherwise
    # start at 001.
    if hostname_last:
        last_id = int(hostname_last.hostname.split("-")[3].split(".")[0])
        hostname_last_id = str(last_id + 1).zfill(3)
    else:
        hostname_last_id = '001'
    hostname = prefix + hostname_last_id + '.' + 'meetyima.com'
    return hostname, innerIP, pubIP, status, instanceid
def get_candidates(error_phrase):
    """Rank correction candidates for ``error_phrase`` by pinyin similarity.

    Candidates come from ``known(edits1(error_phrase, cn_words_dict))``.

    Returns:
        tuple: three lists of candidate phrases —
        1st order: pinyin identical to the erroneous phrase;
        2nd order: only the first syllable matches;
        3rd order: everything else.
    """
    candidates_1st_order = []
    candidates_2nd_order = []
    candidates_3nd_order = []
    p = Pinyin()
    # Ask for "/"-separated syllables directly. The previous code called
    # re.sub() and discarded the result, so the pinyin stayed "-"-separated
    # and the first-syllable comparison below could never succeed.
    error_pinyin = p.get_pinyin(error_phrase, "/")
    error_first_syllable = error_pinyin.split("/")[0]
    cn_words_dict = load_cn_words_dict(
        "HW10/Autochecker4Chinese-master/cn_dict.txt")
    candidate_phrases = list(known(edits1(error_phrase, cn_words_dict)))
    for candidate_phrase in candidate_phrases:
        candidate_pinyin = p.get_pinyin(candidate_phrase, "/")
        if candidate_pinyin == error_pinyin:
            candidates_1st_order.append(candidate_phrase)
        elif candidate_pinyin.split("/")[0] == error_first_syllable:
            candidates_2nd_order.append(candidate_phrase)
        else:
            candidates_3nd_order.append(candidate_phrase)
    return candidates_1st_order, candidates_2nd_order, candidates_3nd_order
def decode(self, area):
    """Map a free-text area description onto a province/city record.

    Matching is attempted in this order, and the first hit wins:

    1. a city name contained in ``area``          -> level "3"
    2. a province name contained in ``area``      -> level "2"
    3. the literal country name                   -> level "1"
    4. ``_<province pinyin>_<city pinyin>`` contained in ``area``
       (case-insensitive)                         -> level "4"

    Returns:
        dict with the keys ``province``, ``city``, ``area`` and ``level``;
        all names are empty and level is "4" when nothing matches.
    """
    p = Pinyin()
    print("\n\n\n\n\n zone_level_area :"+area)

    # 1) City names.
    for province in self.zonelevel:
        for city in province["children"]:
            city_value = city["value"]
            if city_value in area:
                return {"province": province["value"], "city": city_value,
                        "area": "", "level": "3"}

    # 2) Province names.
    for province in self.zonelevel:
        province_value = province["value"]
        if province_value in area:
            return {"province": province_value, "city": "", "area": "",
                    "level": "2"}

    # 3) Whole country.
    if "中华人民共和国" in area:
        return {"province": "", "city": "", "area": "", "level": "1"}

    # 4) Pinyin-encoded "_province_city" marker.
    for province in self.zonelevel:
        cities = province["children"]
        province_name = province["value"]
        for city in cities:
            city_name = city["value"]
            city_name_py = ("_" + p.get_pinyin(province_name, '')
                            + "_" + p.get_pinyin(city_name, ''))
            print("city_name_py:"+city_name_py.lower()+" "+ area.lower())
            if city_name_py.lower() in area.lower():
                return {"province": province_name, "city": city_name,
                        "area": area.split("(")[0], "level": "4"}

    return {"province": "", "city": "", "area": "", "level": "4"}
def welcome():
    """Print a welcome e-mail for one newcomer or for a list of newcomers.

    With ``options.name`` a personal message is printed; with
    ``options.file`` the file is read as UTF-8, one name per line, and a
    combined message is printed. The mail account of each person is the full
    pinyin of the first character followed by the initials of the remaining
    characters. Without either option a hint is written to stderr.
    """
    options = parse_options()
    p = Pinyin()

    def abbreviate(syllables):
        # First syllable in full, then the first letter of each other one.
        return syllables[0] + "".join(item[0] for item in syllables[1:])

    if options.name:
        username = unicode(options.name)
        second_name = username[1:]
        pinyin_abbrev = abbreviate(p.get_pinyin(username, " ").split())
        message_format = u"{0}同学好,大家好,\n最近我们组迎来了{0}同学,欢迎他们加入我们温暖的大家庭!\n{1}同学的组内邮箱为: {2}@nlp.nju.edu.cn\n大家多联系,多关照。\n如果有任何问题,请联系管理员程善伯[email protected]或郁振庭[email protected]。\n另:组内为每位同学分配了一定资源,附件中为组内资源介绍。\n\n祝好!\n\n谢谢\n振庭"
        message = message_format.format(username, second_name, pinyin_abbrev)
        print(message)
    elif options.file:
        # Unicode template: formatting a byte-string template with unicode
        # names fails for non-ASCII text on Python 2.
        message_format = u"大家好,\n最近我们组迎来了:\n{0}等{1}位同学同学。\n欢迎他们加入我们温暖的大家庭!\n大家多联系,多关照。\n如果有任何问题,请联系管理员\n另:组内为每位同学分配了一定资源,附件中为组内资源介绍以及组内编程规范,\n请仔细阅读。\n\n祝好!\n\n谢谢\n振庭"
        with codecs.open(options.file, encoding="utf-8") as infile:
            # Skip blank lines; they have no syllables to abbreviate.
            names = [line.strip() for line in infile if line.strip()]
        num_students = len(names)
        pinyin_abbrevs = [abbreviate(p.get_pinyin(name, " ").split())
                          for name in names]
        email_format = u"{0}({1}@nlp.nju.edu.cn)"
        emails = [email_format.format(name, pinyin_abbrev)
                  for name, pinyin_abbrev in zip(names, pinyin_abbrevs)]
        message = message_format.format("\n".join(emails), num_students)
        print(message)
    else:
        sys.stderr.write("Please check your options\n")
def get_matrix_pinyin(pos_path="data/samples/positive.txt", neg_path="data/samples/negative.txt"):
    """Build TF-IDF matrices over pinyin tokens for two sample files.

    Every non-blank line is converted to pinyin (no separator between
    syllables) and split on whitespace. A dictionary and TF-IDF model are
    fitted on the positive and negative samples together.

    Args:
        pos_path: UTF-8 text file with positive samples, one per line.
        neg_path: UTF-8 text file with negative samples, one per line.

    Returns:
        tuple: ``(pos_matrix, neg_matrix)``, dense arrays with one row per
        sample and one column per dictionary entry.
    """
    from xpinyin import Pinyin

    converter = Pinyin()

    def read_samples(path):
        # One token list per non-blank line.
        with open(path, encoding='utf8') as handle:
            return [converter.get_pinyin(row, '').split()
                    for row in handle if row != '\n']

    positives = read_samples(pos_path)
    pos_len = len(positives)
    print("positive matrix length", pos_len)

    negatives = read_samples(neg_path)
    neg_len = len(negatives)
    print("negative matrix length", neg_len)

    dataset = positives + negatives
    dct = Dictionary(dataset)
    print("dictionary length", len(dct))
    corpus = [dct.doc2bow(tokens) for tokens in dataset]
    model = TfidfModel(corpus)

    vocab_size = len(dct)
    pos_matrix = np.zeros((pos_len, vocab_size))
    neg_matrix = np.zeros((neg_len, vocab_size))
    # Scatter the sparse (column, weight) pairs into the dense matrices.
    for matrix, weighted_rows in ((pos_matrix, model[corpus][:pos_len]),
                                  (neg_matrix, model[corpus][pos_len:])):
        for i, weights in enumerate(weighted_rows):
            for j, n in weights:
                matrix[i, j] = n
    print("get matrix completed")
    return pos_matrix, neg_matrix
def xiaoqu_write_into_excel(self):
    """Scrape residential-compound data per district into one workbook.

    A sheet is created for each district, the district name is converted to
    pinyin to build the listing URLs, and every scraped record is appended as
    a row. The workbook is saved as ``上海链家小区数据.xlsx``.
    """
    regions = [
        '浦东', '闵行', '宝山', '徐汇', '普陀', '杨浦', '长宁', '松江', '嘉定', '黄浦', '静安',
        '闸北', '虹口', '青浦', '奉贤', '金山', '崇明', '上海周边'
    ]
    header = ['小区名称', '挂牌均价(元/平)', '在售数量(套)', '区域', '位置', '地铁', '建造时间']
    p = Pinyin()
    filename = '上海链家小区数据.xlsx'
    wb = Workbook()
    for region in regions:
        print(region)
        sheet = wb.create_sheet(region)
        # Convert the Chinese name to pinyin as required by the URL scheme;
        # this one district is spelled out explicitly.
        if region == '闵行':
            region_slug = 'minhang'
        else:
            region_slug = p.get_pinyin(region, '')
        urls = xiaoqu.url(region_slug)
        sheet.append(header)
        for xq in xiaoqu.xiaoqu_data(urls):
            print(xq)
            sheet.append([
                xq[0], toNumber(xq[1]), toNumber(xq[2]),
                xq[3], xq[4], xq[5], xq[6],
            ])
    wb.save(filename=filename)
    # NOTE(review): `ts` is not defined in this function; presumably a
    # module-level start timestamp -- confirm.
    print('Took {}s'.format(time() - ts))
class PinyinTests(unittest.TestCase):
    """Checks for ``Pinyin.get_pinyin``, ``get_initial`` and ``get_initials``."""

    def setUp(self):
        import cpinyin
        cpinyin.install()
        from xpinyin import Pinyin
        self.p = Pinyin()

    def test_get_pinyin_with_default_splitter(self):
        self.assertEqual(self.p.get_pinyin(u'上海'), u'shang-hai')

    def test_get_pinyin_with_splitter(self):
        self.assertEqual(self.p.get_pinyin(u'上海', splitter=u''), u'shanghai')

    def test_get_pinyin_mixed_words(self):
        self.assertEqual(self.p.get_pinyin(u'Apple发布iOS7', splitter=u'-'),
                         u'Apple-fa-bu-iOS7')

    def test_get_pinyin_with_tone_marks(self):
        self.assertEqual(self.p.get_pinyin(u'上海', tone_marks=u'marks'),
                         u'sh\xe0ng-h\u01cei')

    # Renamed: this test previously had the same name as the one above,
    # which replaced it in the class body so the first never ran.
    def test_get_pinyin_with_tone_marks_single_char(self):
        self.assertEqual(self.p.get_pinyin(u'秋', tone_marks=u'marks'), u'qiū')

    def test_get_initial(self):
        self.assertEqual(self.p.get_initial(u'你'), u'N')

    def test_get_initials(self):
        self.assertEqual(self.p.get_initials(u'你好'), u'N-H')

    def test_get_initials_with_splitter(self):
        self.assertEqual(self.p.get_initials(u'你好', u' '), u'N H')
        self.assertEqual(self.p.get_initials(u'你好', u''), u'NH')
def process_east_asian_dataset():
    """Build a pinyin-name -> gender lookup from the two Chinese name lists.

    Names are read from ``data/female_c.txt`` and ``data/male_c.txt``
    (whitespace/tab separated), converted to pinyin with the syllable
    separators removed, and stored under both the plain and the doubled form
    (``key`` and ``key + key``). Female names are processed first; an
    existing key is never overwritten.

    Returns:
        dict: pinyin string -> ``'F'`` or ``'M'``.
    """
    p = Pinyin()
    data = {}

    def _read_names(path):
        # Collect every non-empty token from the file. The encoding is given
        # explicitly so the Chinese text does not depend on the platform
        # default.
        names = []
        with open(path, encoding='utf-8') as f:
            for line in f:
                line = line.replace('\n', '').replace('\t', ' ')
                names += [token for token in line.split(' ') if token]
        return names

    def _register(names, label):
        # Store the plain and doubled pinyin, keeping earlier entries.
        for name in names:
            key = (p.get_pinyin(name)
                   .replace('、', ' ').replace('-', '').replace('\n', ''))
            data.setdefault(key, label)
            data.setdefault(key + key, label)

    _register(_read_names('data/female_c.txt'), 'F')
    _register(_read_names('data/male_c.txt'), 'M')
    return data
def handle(self, *args, **options):
    """Seed categories, restaurants and products from the MongoDB food dump.

    Distinct ``category`` values of the ``food`` collection become
    ``Category`` rows (slug = pinyin of the name), a fixed list of
    restaurants is created, and every food document becomes a ``Product``
    with a random restaurant and a random price between 10 and 100.
    """
    MONGO_URI = 'mongodb://localhost:27017'
    MONGO_DB = 'chinese_food'
    pin = Pinyin()
    db = pymongo.MongoClient(MONGO_URI)[MONGO_DB]

    category_names = list({food['category'] for food in db.food.find({})})
    # A few made-up restaurants.
    restaurant_names = ['阿坤私房菜', '橘子餐厅', '北欧时光·清真', '辣一天川小館', '川人百味']

    categories = [Category(name=cat, slug=pin.get_pinyin(cat))
                  for cat in category_names]
    restaurants = [Restaurant(name=res) for res in restaurant_names]
    for category in categories:
        category.save()
    for restaurant in restaurants:
        restaurant.save()

    for category in categories:
        for food in db.food.find({'category': category.name}):
            image_path = '/static/food_images/{}/{}.jpg'.format(
                category.name, food['name'])
            product = Product.objects.create(
                category=category,
                name=food['name'],
                slug=slugify(pin.get_pinyin(food['name'])),
                image=image_path,
                kouwei=food['kouwei'],
                gongyi=food['gongyi'],
                restaurant=random.choice(restaurants),
                description=food['intro'] or 'no description.',
                price=random.randint(10, 100))
            product.save()
def pinyin(target, answer):
    """Print whether ``answer`` sounds the same as ``target``.

    Digits 1-9 in ``answer`` are first replaced by Chinese numerals, then
    both strings are converted to pinyin (no separator) and compared.

    Args:
        target: the expected text.
        answer: the text to check; may contain ASCII digits.
    """
    p = Pinyin()
    dict_num2str = {
        '1': '一',
        '2': '俩',
        '3': '三',
        '4': '四',
        '5': '五',
        '6': '六',
        '7': '七',
        '8': '八',
        '9': '九'
    }
    # Convert digits to Chinese numerals.
    answer_new = ''.join(dict_num2str.get(ch, ch) for ch in answer)
    # Convert both sides to pinyin. Previously the normalised answer was
    # overwritten with the pinyin of `target`, so `answer` never took part
    # in the comparison.
    target_new = p.get_pinyin(target, '')
    answer_new = p.get_pinyin(answer_new, '')
    print(target_new == answer_new)
def inTitle(name):
    """Return quoted search variants of a Chinese personal name.

    The surname is the first two characters when that prefix is in ``D``,
    otherwise the first character.

    Returns:
        list of four quoted strings (given+family, family+given, and the two
        pinyin orderings separated by a space), or ``None`` for an empty
        name.
    """
    p = Pinyin()
    if len(name) == 0:
        return None
    surname_len = 2 if name[:2] in D else 1
    last, first = name[:surname_len], name[surname_len:]
    L = p.get_pinyin(last, '')
    F = p.get_pinyin(first, '')
    variants = (first + last, last + first, L + " " + F, F + " " + L)
    return ['"' + variant + '"' for variant in variants]
def convert_to_pinyin(name):
    """Return ``"<Given> <Family>"`` in capitalised pinyin for a Chinese name.

    The name is first passed through ``tradition2simple``; its first
    character is taken as the family name and the rest as the given name.
    """
    simplified = tradition2simple(name)
    converter = Pinyin()
    given = converter.get_pinyin(simplified[1:], '').capitalize()
    family = converter.get_pinyin(simplified[0], '').capitalize()
    return given + ' ' + family
def name_submit(request):
    """Predict country and gender for the submitted name and return JSON.

    The name is read from the GET or POST parameters. Names containing CJK
    characters are converted to ``"<Given> <Family>"`` pinyin before
    prediction. The prediction models are loaded on first use and kept in
    module globals.

    Returns:
        JsonResponse: ``{'gender': 'NONAME'}`` when no name was supplied,
        otherwise ``{'country': ..., 'gender': 'MALE' | 'FEMALE'}``.
    """
    global GENDER_PREDICTOR, CHINESE_GENDER_PREDICTOR, COUNTRY_PREDICTOR

    ip = log_visitor_ip(request)
    if request.method == 'GET':
        request_data = request.GET
    elif request.method == 'POST':
        request_data = request.POST
    else:
        request_data = {}
    response_data = {}

    # Load the models lazily.
    if not CHINESE_GENDER_PREDICTOR:
        CHINESE_GENDER_PREDICTOR = load_gender_predict_model(
            CHINESE_GENDER_MODEL_PATH)
    if not GENDER_PREDICTOR:
        GENDER_PREDICTOR = load_gender_predict_model(GENDER_MODEL_PATH)
    if not COUNTRY_PREDICTOR:
        COUNTRY_PREDICTOR = load_country_predict_model()

    # .get() so a request without the parameter gets the NONAME answer
    # instead of an unhandled lookup error.
    target_name = request_data.get('name')
    if not target_name:
        return JsonResponse({'gender': 'NONAME'})

    try:
        name_info = NameInfo(name=target_name,
                             gender=request_data.get('gender', None),
                             country=request_data.get('country', None),
                             time=datetime.datetime.now(),
                             ip=ip)
        name_info.save()
    except Exception:
        # Recording the query is best-effort and must not fail the request.
        pass

    is_chinese = any(u'\u4e00' <= c <= u'\u9fff' for c in target_name)
    if is_chinese:
        py = Pinyin()
        target_name = ' '.join([
            py.get_pinyin(target_name[1:], '').capitalize(),
            py.get_pinyin(target_name[0], '').capitalize(),
        ])
    if isinstance(target_name, unicode):
        target_name = target_name.encode('utf-8')

    # Country prediction.
    country = COUNTRY_PREDICTOR.predict(target_name)
    response_data['country'] = country.capitalize()
    if country == 'china':
        is_chinese = True

    # Gender prediction.
    if is_chinese:
        is_male = CHINESE_GENDER_PREDICTOR.predict(target_name)
    else:
        is_male = GENDER_PREDICTOR.predict(target_name)
    response_data['gender'] = 'MALE' if is_male else 'FEMALE'
    return JsonResponse(response_data)
def article_cat_edit(id):
    """Show (GET) or save (any other method) the edit form of a category.

    GET renders the edit template with the category ``id`` and an HTML tree
    of all categories. Otherwise the submitted form is validated and the
    category row updated; the directory slug falls back to the pinyin of the
    category name when the "check" box is ticked or no directory was typed.

    Returns:
        The rendered template (GET) or a redirect to the category list after
        a successful update. An invalid form falls through without a
        response.
    """
    if request.method == 'GET':
        cat_list = Articles_Cat.query.filter_by(cat_id=id).first()
        # Fetch every category to build the parent selector.
        rows = [
            dict(cat_id=cat.cat_id, parent_id=cat.parent_id,
                 cat_name=cat.cat_name)
            for cat in Articles_Cat.query.all()
        ]
        tree = build_tree(rows, 0, 0)
        html = build_table(tree, parent_title='顶级菜单')
        return render_template('admin/articel_cat_edit.html',
                               content=cat_list,
                               message=html)

    form = Article_cat(request.form)
    converter = Pinyin()
    if form.validate():
        posted = request.form
        parent_id = posted.get('parent_id')
        print(parent_id)
        cat_id = int(posted.get('cat_id'))
        cat_name = posted.get('cat_name')
        directory = posted.get('dir')
        if posted.get('check') or not directory:
            directory = converter.get_pinyin(cat_name, '')
        changes = {
            Articles_Cat.parent_id: parent_id,
            Articles_Cat.cat_name: cat_name,
            Articles_Cat.dir: directory,
            Articles_Cat.keywords: posted.get('keywords'),
            Articles_Cat.description: posted.get('description'),
            Articles_Cat.cat_sort: posted.get('cat_sort'),
            Articles_Cat.status: posted.get('status'),
        }
        Articles_Cat.query.filter(Articles_Cat.cat_id == cat_id).update(changes)
        db.session.commit()
        return redirect(url_for('admin.article_cat_list'))
def article_cat_add():
    """Show (GET) or process (any other method) the "add category" form.

    GET renders the form with an HTML tree of all categories. Otherwise the
    form is validated and a new ``Articles_Cat`` row inserted; the directory
    slug is the pinyin of the category name when the "check" box is ticked
    or no directory was typed.

    Returns:
        The rendered template, a redirect to the category list after a
        successful insert, or the validation-failure message.
    """
    if request.method == 'GET':
        # Fetch every category to build the parent selector.
        rows = [
            dict(cat_id=cat.cat_id, parent_id=cat.parent_id,
                 cat_name=cat.cat_name)
            for cat in Articles_Cat.query.all()
        ]
        tree = build_tree(rows, 0, 0)
        print(tree)
        html = build_table(tree, parent_title='顶级菜单')
        return render_template('admin/article_cat.html', message=html)

    form = Article_cat(request.form)
    converter = Pinyin()
    posted = request.form
    print(posted.get('dir'))
    if not form.validate():
        print("校验没有通过")
        return "校验没通过"

    cat_name = posted.get('cat_name')
    directory = posted.get('dir')
    if posted.get('check') or not directory:
        directory = converter.get_pinyin(cat_name, '')
    insert = Articles_Cat(parent_id=posted.get('parent_id'),
                          cat_name=cat_name,
                          dir=directory,
                          keywords=posted.get('keywords'),
                          description=posted.get('description'),
                          cat_sort=posted.get('cat_sort'),
                          status=posted.get('status'))
    db.session.add(insert)
    db.session.commit()
    return redirect(url_for('admin.article_cat_list'))
def write_to_excel(poilist, cityname, classfield, coord):
    """Write POI records to ``data/poi/<city>-<class>.xls`` and return the path.

    Args:
        poilist: sequence of POI dicts with the keys ``location``
            ("lng,lat"), ``name``, ``address``, ``pname``, ``cityname``,
            ``business_area`` and ``type``.
        cityname: city name used for the file name when ``poilist`` is empty.
        classfield: sheet name; its pinyin is the second file-name part.
        coord: ``"2"`` converts coordinates with ``gcj02_to_wgs84``, ``"3"``
            with ``gcj02_to_bd09``; any other value keeps them as read.

    Returns:
        str: the relative path of the saved workbook.
    """
    # A Workbook object corresponds to one Excel file.
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet(classfield, cell_overwrite_ok=True)

    # Header row.
    headers = ('lon', 'lat', 'name', 'address', 'pname', 'cityname',
               'business_area', 'type')
    for col, title in enumerate(headers):
        sheet.write(0, col, title)

    detail_keys = ('name', 'address', 'pname', 'cityname', 'business_area',
                   'type')
    for row, poi in enumerate(poilist, start=1):
        location = poi['location']
        details = [poi[key] for key in detail_keys]
        # NOTE(review): the `cityname` argument is rebound to each record's
        # city, so the file name below uses the last record's value.
        cityname = details[3]
        parts = str(location).split(",")
        lng = parts[0]
        lat = parts[1]
        if coord == "2":
            lng, lat = gcj02_to_wgs84(float(lng), float(lat))[:2]
        elif coord == "3":
            lng, lat = gcj02_to_bd09(float(lng), float(lat))[:2]
        # Write one row per POI.
        for col, value in enumerate([lng, lat] + details):
            sheet.write(row, col, value)

    # Finally save everything to the target Excel file.
    p = Pinyin()
    path = ("data/poi/" + p.get_pinyin(cityname) + "-"
            + p.get_pinyin(classfield) + '.xls')
    book.save(r'' + os.getcwd() + "/" + path)
    return path
class Fuzzname(object):
    """Fuzzy lookup of names using the name plus its pinyin as the match key."""

    def __init__(self):
        self.fuzzymap = dict()
        self.pinyin = Pinyin()

    def _key(self, name):
        # Match key: the name followed by a space and its pinyin.
        return name + " " + self.pinyin.get_pinyin(name, '')

    def fit(self, namelis):
        """Replace the lookup table with keys built from ``namelis``."""
        self.fuzzymap = {self._key(name): name for name in namelis}

    def predict(self, name):
        """Return the fitted name whose key best matches ``name``."""
        best_key = process.extractOne(self._key(name), self.fuzzymap.keys())[0]
        return self.fuzzymap[best_key]
def generateSpell():
    """Read ``data.xlsx``, derive pinyin for each row and write ``data_former.xlsx``.

    Rows after the header of the first sheet are read as
    (meaning, part of speech, slice). The pinyin syllables of the slice are
    stored on ``bean.spell`` as a brace-delimited, double-quoted list with
    all spaces removed. The output workbook gets one row per bean, starting
    at row index 1, with the spelling in the fourth column.

    Returns:
        list: the ``Bean`` objects that were read.
    """
    beanList = []
    source = xlrd.open_workbook(os.getcwd() + '\\data.xlsx')
    sheet = source.sheets()[0]
    for row in range(1, sheet.nrows):
        bean = Bean()
        bean.mean = sheet.cell_value(row, 0).strip().replace('/', '-')
        bean.speech = sheet.cell_value(row, 1).strip()
        bean.slice = sheet.cell_value(row, 2).strip()
        beanList.append(bean)

    # One converter for all rows instead of one per row.
    p = Pinyin()
    for bean in beanList:
        phonetices = p.get_pinyin(bean.slice, " ").split(" ")
        bean.spell = (str(phonetices)
                      .replace('[', '{')
                      .replace(']', '}')
                      .replace('\'', '\"')
                      .replace(' ', ''))

    workbook = xlsxwriter.Workbook(os.getcwd() + '\\data_former.xlsx')
    # Write to the sheet that was actually created. The previous code then
    # looked up a 'Sheet2' that was never added, which left no worksheet to
    # write to.
    ws = workbook.add_worksheet(u'Sheet1')
    for i, bean in enumerate(beanList, start=1):
        ws.write(i, 0, bean.mean)
        ws.write(i, 1, bean.speech)
        ws.write(i, 2, bean.slice)
        # Replaces the former 'test' placeholder written to column 3.
        ws.write(i, 3, bean.spell)
    workbook.close()
    return beanList
def addWeather(self, cityList):
    """Fetch the weather for each city and store it in a per-city table.

    For every city the weather API is queried, a table named after the
    city's pinyin (without separators) is created if missing, and
    yesterday's record plus every forecast record is inserted as
    (date, high, low).
    """
    # NOTE(review): values from the HTTP response are interpolated straight
    # into the SQL text; switch to driver placeholders once the DB driver's
    # parameter style is confirmed.
    insert_sql = 'INSERT INTO {} VALUES ("{}","{}","{}")'
    for city in cityList:
        url = 'https://www.apiopen.top/weatherApi?city={}'.format(city)
        response = urlopen(quote(url, safe=string.printable))
        responseJson = json.loads(response.read())
        print(responseJson)

        table = Pinyin().get_pinyin(city).replace('-', '')
        self.cursor.execute(
            'create table if not exists {}(date text,high text,low text)'.format(table))
        self.con.commit()

        def store(day):
            # Insert one (date, high, low) record and commit it.
            self.cursor.execute(
                insert_sql.format(table, day['date'], day['high'], day['low']))
            self.con.commit()

        store(responseJson['data']['yesterday'])
        for day in responseJson['data']['forecast']:
            store(day)
def airMysqlDataFransform():
    """Export air-quality data from MySQL into one CSV file per monitoring point.

    The distinct monitoring points are written to ``../data/placeItems.csv``.
    For every non-null point the name is converted to pinyin, the query
    produced by ``handleAirDataSql`` is run, and the result is saved as
    ``../airData/<pinyin>.csv``. The connection is closed afterwards, also
    when an error occurs.
    """
    conn = pymysql.connect(host='127.0.0.1',
                           user='******', password='******',
                           db='testdata', charset='utf8',
                           use_unicode=True)
    try:
        # Output file listing the monitoring points.
        placeItemsFile = '../data/placeItems.csv'
        # Query every area where air quality is monitored.
        placeSql = "SELECT DISTINCT(monitoring_point) from air_quality"
        placeItems = pd.read_sql(placeSql, con=conn)
        # File names use pinyin.
        p = Pinyin()
        # Store the monitored areas.
        placeItems.to_csv(placeItemsFile)
        # Current time.
        print("Time:", datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        for indexs in placeItems.index:
            placeItem = placeItems.loc[indexs].values[0]
            # Skip null values.
            if placeItem is None:
                continue
            # Pre-process the air_quality table for this point.
            placeItem = p.get_pinyin(placeItem, '')
            airSql = handleAirDataSql(placeItem)
            # Run the query and convert the result.
            AQIData = pd.read_sql(airSql, con=conn)
            placeAir = "../airData/" + placeItem + ".csv"
            AQIData.to_csv(placeAir, index=False)
        print("placeAir to csv finished")
    finally:
        conn.close()
def getGameLink(word, p=1):
    """Collect app links from the Steam search result pages for ``word``.

    Pages are fetched one after another starting at ``p``; the links of each
    page are stored with ``add_eq`` under the pinyin of the keyword, followed
    by a random pause of 1-20 seconds. The page count is kept on the function
    attribute ``getGameLink.total`` and refreshed whenever a crawl starts at
    page 1.

    Args:
        word: search keyword.
        p: page number to start from.

    Returns:
        bool: ``True`` once the page counter reaches ``getGameLink.total``.
    """
    eqKey = Pinyin().get_pinyin(word, "_")
    # Loop instead of recursing so long result lists cannot exhaust the
    # recursion limit.
    while True:
        url = ("https://store.steampowered.com/search/?term=" + word
               + "&page=" + str(p))
        print(url)
        # Refresh the count for every new search; otherwise a value cached
        # from a previous keyword would be reused.
        if p == 1 or not hasattr(getGameLink, 'total'):
            getGameLink.total = getTotal(url)
        # NOTE(review): as before, the page equal to `total` is not
        # collected -- confirm whether getTotal() is one past the last page.
        if p >= getGameLink.total:
            return True
        # Headers must be passed by keyword; the second positional parameter
        # of requests.get is `params`.
        r = requests.get(url, headers=common.getHeaders())
        soup = BeautifulSoup(r.text, "lxml")
        # Extract the links and store them.
        links = getApplinks(soup)
        sleep_sec = random.randint(1, 20)
        print("关键词:%s 共计 %d 页 | 当前采集第 %d页;采集链接数:%d | 休眠 %d 秒" %
              (word, getGameLink.total, p, len(links), sleep_sec))
        add_eq(eqKey, links)
        time.sleep(sleep_sec)
        p = p + 1
def read_from_sentence_txt(start, emission, transition):
    """Accumulate start/emission/transition counts from ``SENTENCE_FILE``.

    The third tab-separated column of every line is the sentence. Listed
    quote/underscore characters are removed, one trailing punctuation mark is
    dropped, and sentences shorter than two characters are skipped.

    Args:
        start: dict, first character -> count (updated in place).
        emission: dict, character -> {pinyin -> count} (updated in place).
        transition: dict, character -> {next character -> count}
            (updated in place).
    """
    # ./result/sentence.txt
    print('read from sentence.txt')
    # Built once; it was previously re-created for every line and then
    # shadowed by the loop variable of the same name.
    converter = Pinyin()
    with codecs.open(SENTENCE_FILE, 'r', 'utf-8') as fin:
        for raw in fin:
            line = regex.sub(u"[_《》“”]", r"", raw.strip().split('\t')[2])
            # Check for an empty sentence first: indexing [-1] on an empty
            # string raises IndexError.
            if line and line[-1] in [',', ':', '?', '!', '。']:
                line = line[:-1]
            if len(line) < 2:
                continue

            # Start counts.
            start.setdefault(line[0], 0)
            start[line[0]] += 1

            # Emission counts.
            # NOTE(review): the character/syllable pairing assumes one pinyin
            # token per character -- confirm for mixed-script sentences.
            pnyns = converter.get_pinyin(line, " ").split()
            for hanzi, syllable in zip(line, pnyns):
                emission.setdefault(hanzi, {})
                emission[hanzi].setdefault(syllable, 0)
                emission[hanzi][syllable] += 1

            # Transition counts.
            for f, t in zip(line[:-1], line[1:]):
                transition.setdefault(f, {})
                transition[f].setdefault(t, 0)
                transition[f][t] += 1
def fromCodeToName(factors, codes):
    """Return the pinyin of the ``name`` of every row whose index is in ``codes``.

    Args:
        factors: table with a ``name`` column, indexed by code.
        codes: the codes to look up.

    Returns:
        list of pinyin strings with the converter's default separator.
    """
    # Select the names of the requested codes.
    selected = factors[factors.index.isin(codes)].name.values
    # Convert the Chinese names to pinyin.
    converter = Pinyin()
    return list(map(converter.get_pinyin, selected))
def main():
    """Split ``aqi.json`` into one CSV file per city under ``aqi/<Province>/``.

    The province directory is the capitalised pinyin of the province name.
    All values of a city are flattened and reshaped into nine columns with
    ``date`` as the index. Existing CSV files are left untouched.
    """
    converter = Pinyin()
    with open('aqi.json', 'r', encoding='utf-8') as f:
        aqi = json.load(f)

    columns = ['date', 'aqi', 'aqi_rank', 'pm25', 'pm10', 'so2', 'no2',
               'co', 'o3']
    for province in aqi.keys():
        print(province)
        prov_name = converter.get_pinyin(province, '').capitalize()
        prov_name_path = Path.cwd() / 'aqi' / prov_name
        if not os.path.exists(prov_name_path.as_posix()):
            os.makedirs(prov_name_path.as_posix())

        for city_name, city_values in aqi[province].items():
            print(city_name)
            # Flatten all chunks of the city into a single list.
            flat = [value for chunk in city_values.values() for value in chunk]
            frame = pd.DataFrame(np.array(flat).reshape(-1, 9),
                                 columns=columns).set_index('date')
            csv_name = prov_name_path / (city_name + '.csv')
            if not os.path.exists(csv_name.as_posix()):
                frame.to_csv(csv_name)
def fromCNToEN(name):
    """Return the pinyin of every element of ``name``.

    Args:
        name: iterable of Chinese strings.

    Returns:
        list of pinyin strings with the converter's default separator.
    """
    # Convert the Chinese strings to pinyin.
    converter = Pinyin()
    return list(map(converter.get_pinyin, name))
def content_parse(self, response):
    """Parse a post page into a ``PostDetail`` item and index its tags.

    The item id is ``'T'`` plus the file stem of the URL. Each tag is stored
    in ``db.Tags`` under its pinyin id when missing, and the post id is added
    to the tag's entry in ``db.Tagmap``. The body is collected paragraph by
    paragraph as image or text parts.

    Yields:
        The populated ``PostDetail`` item.
    """
    item = PostDetail()
    post_id = 'T' + response.url.split('/')[-1].split('.')[0]
    item['_id'] = post_id

    spans = response.xpath(
        '//*[starts-with(@class, "details_01_l")]/span/text()').extract()
    item['author'] = spans[1] if len(spans) == 3 else '佚名'

    tags = response.xpath(
        '//div[starts-with(@class, "lab_01 m_t_40")]/a/text()').extract()
    item['tag'] = tags

    converter = Pinyin()
    tags_db = db.Tags
    tagmap_db = db.Tagmap
    for raw_tag in tags:
        tag = raw_tag.encode('utf-8')
        tags_id = converter.get_pinyin(unicode(tag, "utf8"), '')
        if not tags_db.find_one({"_id": tags_id}):
            tags_db.insert({'_id': tags_id, 'tagname': tag})
        if not tagmap_db.find_one({"_id": tags_id}):
            tagmap_db.insert({'_id': tags_id, 'post_id': [post_id]})
        else:
            tagmap_db.update({"_id": tags_id},
                             {"$addToSet": {"post_id": post_id}})

    paragraphs = response.xpath(
        '//*[starts-with(@class, "content_01 m_t_30 detasbmo")]/p')
    content = []
    for paragraph in paragraphs:
        images = paragraph.xpath('img/@src').extract()
        if images:
            content.append({'type': 'img', 'body': images})
        else:
            content.append({'type': 'text',
                            'body': paragraph.xpath('text()').extract()})
    item['content'] = content
    item['comment'] = []  # comments are kept as a list
    item['coll'] = 'PostDetail'
    yield item
def excel_to_mysql(ex_path, table_name):
    """Load an Excel sheet into a MySQL table, replacing the table if it exists.

    Column headers are converted to pinyin (no separator) and a ``time``
    column holding the import timestamp is appended. The DataFrame index is
    stored as the ``id`` column.

    Args:
        ex_path: path of the Excel file to read.
        table_name: name of the destination table.
    """
    engine = create_engine('mysql+pymysql://root:111111@localhost:3306/oceans')
    # Read the local Excel file. `encoding` and `sep` are CSV options that
    # read_excel does not take, so they are no longer passed.
    df = pd.read_excel(ex_path)

    from xpinyin import Pinyin
    p = Pinyin()
    # The default splitter is `-`; use none for column names.
    df.columns = [p.get_pinyin(column, "") for column in df.columns.tolist()]

    dt = datetime.datetime.now().strftime('%Y/%m/%d %H:%M')
    df1 = pd.DataFrame({'time': [dt] * df.shape[0]})
    df2 = pd.concat([df, df1], axis=1)
    print(df2)
    # if_exists='replace': drop an existing table, recreate it and insert.
    df2.to_sql(table_name,
               con=engine,
               index=True,
               index_label='id',
               if_exists='replace')
def insertData():
    """Reload the ``weather_citys`` table from ``citys.txt``.

    Each ``code=name`` line becomes one row with the city code, the city
    name, the pinyin of the name, the current Unix timestamp and the flag 1.
    The table is truncated first. On a database or I/O error the error is
    printed and the process exits with status 1.
    """
    print("insertData")
    try:
        # Read the file up front so the handle is closed whatever happens
        # and a missing file is reported before the table is emptied.
        with open("citys.txt", "r") as city_file:
            fileList = city_file.readlines()

        cursor = conn.cursor()
        cursor.execute("truncate table weather_citys")
        cursor.execute("SET NAMES utf8")
        cursor.execute("SET CHARACTER_SET_CLIENT=utf8")
        cursor.execute("SET CHARACTER_SET_RESULTS=utf8")
        conn.commit()

        p = Pinyin()
        date = int(time.mktime(datetime.datetime.now().timetuple()))
        bz = 1
        # Let the driver quote the values instead of formatting them into
        # the statement text.
        sql = ("insert into weather_citys(cityCode,cityName,spellName,date,bz) "
               "values (%s,%s,%s,%s,%s)")
        for fileLine in fileList:
            # Strip the line ending so it is not stored with the city name.
            cityInfo = fileLine.strip().split("=")
            if len(cityInfo) < 2:
                continue  # blank or malformed line
            cityCode = cityInfo[0]
            cityName = cityInfo[1]
            spellName = p.get_pinyin(cityName.decode("utf-8"), '')
            cursor.execute(
                sql,
                (cityCode, cityName, spellName.encode("utf-8"), date, bz))
        conn.commit()
        cursor.close()
        conn.close()
    except (mdb.Error, IOError) as e:
        print("Error %d: %s" % (e.args[0], e.args[1]))
        sys.exit(1)
def _load_term_to_db(fileName): p = Pinyin() count = 0 with open(fileName) as fp: for line in fp: print count count += 1 if(count < 3225): continue try: line = line.decode('gbk') except Exception, err: print line, err continue terms = line.split() for term in terms: if len(term) <= 1: continue pinyin = p.get_pinyin(term, "") cond = {"_id": term} if db.get_term(cond): to = {"$inc": {"accFrequence": 1}} db.update_term(cond, to) else: data = {"_id": term, "pinyin": pinyin, "accFrequence": 1, "queryFrequence": 0} db.insert_term(data)
def Change_ToPinYin(work):
    """Return the pinyin of ``work`` (no separator) with spaces turned into ``_``."""
    converter = Pinyin()
    return converter.get_pinyin(u"" + work, '').replace(' ', '_')
def moveButton(self, button_name="成都市"):
    """Highlight the pressed city button and refresh the dependent widgets.

    The button is looked up via the separator-free pinyin of
    ``button_name``, moved and restyled. Unless the same button was pressed
    again, ``Reduction`` is called with the flag dict, after which the flags
    are updated and the weather, air and HTML views refreshed.
    """
    city_name = Pinyin().get_pinyin(button_name).replace('-', '')
    # ui.City_Dict[city_name] is the index number of the city button.
    slot = ui.City_Dict[city_name]
    button = ui.City_Button[slot]
    button.move(2 + 180 * slot, 6)
    button.setStyleSheet(
        "border-radius:10px;"
        "text-align:bottom;"
        "padding-bottom:2px;"
        f"background-image:url(./city_pictures/{city_name}.png);"
        "color: rgb(97, 101, 247);font:Bold 12.5pt '等线';")

    flag_key = city_name + "_" + str(slot)
    # The newly pressed button becomes the old one, so it can be restored
    # the next time a different button is pressed.
    self.button_flag["old"] = self.button_flag["new"]
    # Pressing the same button again does not trigger a restore.
    if self.button_flag["old"] != flag_key:
        self.Reduction(self.button_flag)
    self.button_flag["old"] = self.button_flag["new"]
    self.button_flag["new"] = flag_key

    # The other widgets react to the city button press.
    self.weatherInfo(button_name)
    self.airButton()
    # Generate the new page and refresh it in the application window.
    self.createHtml(button_name)
def getFullPin():
    """Add the full pinyin of every name in ``cnnames`` to ``username``.

    Each name is stripped, converted to pinyin without separators, printed
    next to its GBK-encoded original and added to the ``username`` set.
    """
    print(u'[-]姓名转全拼')
    converter = Pinyin()
    for raw_name in cnnames:
        name = raw_name.strip()
        py = converter.get_pinyin(name, '')
        print('{}---{}'.format(name.decode('utf-8').encode('gbk'), py))
        username.add(py)
def main():
    """Crawl the insurance listings of every province and store them.

    For each file in ``./各省HTML`` the province name is the file stem; its
    pinyin forms the regional sub-domain. Every URL returned by ``get_urls``
    is parsed and saved, with a short pause between requests. A failing URL
    is logged and skipped.
    """
    html_root = r"C:\Users\my\Desktop\SPIDER\20_保险公司\各省HTML"
    for entry in os.listdir("./各省HTML"):
        provice_file_path = os.path.join(html_root, entry)
        name = entry.split(".")[0]
        subdomain = Pinyin().get_pinyin(name).replace("-", '')
        prefix_url = "http://{}.circ.gov.cn/tabid".format(subdomain)
        urls = get_urls(prefix_url, provice_file_path)
        total = len(urls)
        for count, url in enumerate(urls, start=1):
            time.sleep(0.2)
            print(count, total, url)
            try:
                insurance_info = parse_url(url)
                insurance_info["Province"] = name
                save_to_db(insurance_info)
            except Exception as e:
                logger.info(url)
                logger.info(e)
def editnode(request):
    """Rename a tree node and store the pinyin of the new name.

    Requests without ``"name"`` in the session are redirected to ``/login``.
    The node id and new name come from the ``treeid`` and ``name`` query
    parameters.

    Returns:
        A redirect to the login page, or ``HttpResponse('ok')`` after the
        update.
    """
    if "name" not in request.session:
        return HttpResponseRedirect('/login')
    # NOTE(review): this value is not used below; the lookup is kept because
    # it fails when the session has no "email" entry.
    username = request.session["email"].split('@')[0]
    node_id = request.GET.get('treeid',)
    zh_name = request.GET.get('name',)
    en_name = Pinyin().get_pinyin(zh_name, '')
    Mtree.objects.filter(id=int(node_id)).update(zh_name=zh_name,
                                                 en_name=en_name)
    return HttpResponse('ok')
def word2pinyin(word):
    """Return the pinyin of ``word`` with all spaces removed.

    Byte strings are decoded as UTF-8 first, e.g. '寿宝庄' -> 'shoubaozhuang'.
    """
    from xpinyin import Pinyin
    converter = Pinyin()
    if not isinstance(word, unicode):
        word = unicode(word, 'utf-8')
    # 'shou bao zhuang' -> 'shoubaozhuang'
    return converter.get_pinyin(word, ' ').replace(' ', '')
def load(self, lashou_city_list):
    """Read the city list XML and return ``[id, name, pinyin]`` per city.

    Args:
        lashou_city_list: path or file object of the XML document; every
            ``city`` element must contain ``id`` and ``name`` children.

    Returns:
        list of ``[city_id, city_name, pinyin_of_name]`` lists.
    """
    converter = Pinyin()
    root = ET.parse(lashou_city_list).getroot()
    rows = []
    for city in root.iter('city'):
        city_id = city.find('id').text
        city_name = city.find('name').text
        rows.append([city_id, city_name, converter.get_pinyin(city_name)])
    return rows
def welcome(request):
    # Landing view handling both sign-in and registration.
    #
    # Authenticated users are redirected to '/'. A POST carries `username`,
    # `password` and `action` ('login' or 'reg'). Successful actions
    # redirect; every other path returns `locals()`.
    #
    # NOTE(review): because `locals()` is returned, every local variable
    # name here is part of the returned mapping (presumably the template
    # context -- confirm); do not rename or add locals casually.
    if request.user.is_authenticated():
        return redirect('/')
    appname = u"延伸"
    pagename ='welcome'
    if request.method == 'POST':
        username = request.POST['username']
        password = request.POST['password']
        action = request.POST['action']
        # All three fields are required.
        if username == '' or password =='' or action == '':
            message = '抱歉,服务器开小差了,注册失败。'
            return locals()
        if action == 'login':
            user = authenticate(username=username, password=password)
            if user is not None:
                if user.is_active:
                    login(request, user)
                    return redirect('/')
                else:
                    # Account exists but has been deactivated.
                    message = '账户已被注销。'
                    return locals()
            else:
                # Wrong e-mail or password.
                message = 'Email 或密码错误。'
                return locals()
        elif action == 'reg':
            if Profile.objects.filter(email=username).exists():
                # Already registered.
                message = '您已经注册过了。'
                return locals()
            else:
                try:
                    user = Profile.objects.create_user(email=username, password=password)
                    user.first_name = request.POST['firstname']
                    user.last_name = request.POST['lastname']
                    p = Pinyin()
                    # Search key: space-separated pinyin of the full name
                    # followed by the initial of every character.
                    user.pinyin = p.get_pinyin(user.last_name + user.first_name, ' ')
                    for char in user.last_name + user.first_name:
                        user.pinyin += p.get_initials(char)
                    # Initial contact info: the e-mail address as entry 1.
                    user.contact_info = json.JSONEncoder().encode({"next_id":2, "data":[{"info_id":1, "type":"Email", "key": u"电子邮箱", "value": user.email}]})
                    user.save()
                    # Every new user starts with one default group.
                    category = Category.objects.create(name=u'未分组', owner=user, privilege='{"1":false}')
                    category.save()
                except ValidationError:
                    # Invalid e-mail address.
                    message = '请输入正确的 Email 地址。'
                    return locals()
                if user:
                    # Sign the new user in right away.
                    user = authenticate(username=username, password=password)
                    login(request, user)
                    return redirect('/me/?first=1')
                else:
                    message = '抱歉,服务器开小差了,注册失败。'
                    return locals()
    else:
        return locals()
def construct_city_set(self, lashou_city_list, unlisted): result = [] p = Pinyin() tree = ET.parse(lashou_city_list) root = tree.getroot() for city in root.iter('city'): city_name = city.find('name').text result.append( [city_name, p.get_pinyin(city_name)] ) fhandler = open(unlisted, "r") lines = fhandler.readlines() fhandler.close() for line in lines: city_name = line.strip() city_name = unicode( city_name, "utf-8" ) result.append( [city_name, p.get_pinyin(city_name)] ) city_index = 1 for city in result: print city_index, "\t", city[0], "\t", city[1] city_index = city_index + 1
def query_fulltext_code(code):
    """Full-text search events by the pinyin of ``code``; at most four results.

    The search text is the space-separated pinyin of ``code`` followed by
    each alphabetic initial repeated four times. This is a coroutine: it
    yields the asynchronous query and returns the rows as a list of dicts.
    """
    pinyin = Pinyin()
    letters = pinyin.get_initials(code, splitter=' ').lower().split(' ')
    # NOTE(review): initials are repeated four times, presumably to satisfy
    # a minimum token length of the full-text index -- confirm.
    initials = [letter * 4 for letter in letters if letter.isalpha()]
    logging.debug(initials)
    analysed_code = (pinyin.get_pinyin(code, splitter=u' ') + ' '
                     + ' '.join(initials))
    logging.debug(analysed_code)
    clause = "MATCH(`codepinyin`, `codepinyininitials`) AGAINST (%s)"
    selection = Event.select(
        SQL('*, ' + clause + ' AS similarity', analysed_code))
    query = yield dbutil.do(
        selection.where(SQL(clause, analysed_code)).limit(4).dicts)
    events = list(query)
    logging.debug(events)
    return events
def slugify(text):
    """Build a slug of at most 50 characters, romanizing non-Latin text.

    The text is slugified as-is first.  While the result is empty, the text
    is romanized with the next strategy and slugified again: Japanese
    (romkan), then Chinese (pinyin), then generic transliteration for
    Cyrillic, Greek and other alphabets.  Falsy input yields "".
    """
    if not text:
        return ""

    # Ordered fallbacks; each one runs only if every earlier attempt produced
    # an empty slug, so later converters are never touched unnecessarily.
    romanizers = (
        lambda value: value,
        lambda value: romkan.to_roma(value),
        lambda value: Pinyin().get_pinyin(value),
        lambda value: translit(value, reversed=True),
    )

    slug = ""
    for romanize in romanizers:
        slug = django_slugify(romanize(text))
        if slug:
            break
    return slug[:50]
def translation_cn_to_en(self, cn): p = Pinyin() only = re.compile(ur"[\u4e00-\u9fa5a-z0-9A-Z/-]+") cn = only.findall(cn) pinyin = p.get_pinyin(cn[0], '') self.cur.execute("select * from map where pinyin like '%s%%'" % pinyin) data = self.cur.fetchall() print data if len(data) == 0: return False, u"%s 星系不存在" % cn[0], "" else: en = data[0][1].strip(" ") en = en.replace(" ", "_", data.count(" ")) cn = data[0][2].strip(" ") number = data[0][0] return en , cn, number
def __init__(self, email, password):
    """Open a Bangumi session and prepare title matching.

    Fetches the dummy 'anime' collections (second argument 3), registers the
    pinyin spelling of every Chinese title as an alias of its subject, and
    records a correction from that spelling back to the Chinese title.
    Finally the title set is reset and repopulated.
    """
    self._session = BangumiSession(email, password)
    self._colls = self._session.get_dummy_collections('anime', 3)

    converter = Pinyin()
    for coll in self._colls:
        subject = coll.subject
        if subject.ch_title:
            pinyin_title = converter.get_pinyin(subject.ch_title, '')
            aliases = subject.other_info.get('aliases')
            if aliases:
                aliases.append(pinyin_title)
            else:
                # Missing or empty alias list: start a fresh one.
                subject.other_info['aliases'] = [pinyin_title]
            # Typing the pinyin spelling is corrected to the Chinese title.
            corrections.update({pinyin_title: subject.ch_title})

    self._titles = set()
    self._update_titles()
def create_univs():
    """Seed the ``Univs`` table from ``univs.txt`` when the table is empty.

    ``univs.txt`` sits next to this module and holds one UTF-8 encoded name
    per line.  For every name a ``Univs`` row is added with:

    * ``name``   -- the name itself,
    * ``pinyin`` -- its pinyin syllables joined without separator,
    * ``pprev``  -- the first letter of every pinyin syllable.

    Nothing happens when the table already contains rows.
    """
    if DBSession.query(Univs).count() != 0:
        return

    p = Pinyin()
    cc = '#'  # temporary syllable separator, assumed absent from the names
    univs_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "univs.txt")
    with codecs.open(univs_path, "r", "utf8") as univs_file:
        for line in univs_file:
            # Strip only the line terminator.  Slicing off the last character
            # unconditionally removed a real character from a final line
            # without newline and left '\r' behind on CRLF files.
            name = line.rstrip('\r\n')
            if not name:
                continue  # blank lines would otherwise create empty rows
            # Dropping empty pieces keeps the initials join from indexing
            # into an empty string; single-syllable names are handled too.
            syllables = [s for s in p.get_pinyin(name, cc).split(cc) if s]
            u = Univs(
                name=name,
                pinyin=''.join(syllables),
                pprev=''.join(s[0] for s in syllables),
            )
            DBSession.add(u)
    DBSession.flush()
def get_name(input_file, output_file): # get chinese name and transform into Pinyin print "Please enter your name in Chinese!" name_in_chinese = raw_input("> ").decode(sys.stdin.encoding or locale.getpreferredencoding(True)) p = Pinyin() # create instance of Pinyin class name_in_pinyin = p.get_pinyin(name_in_chinese, '') # translate chinese into Pinyin and get rid of '-' input_fd = open(input_file) input_fd.readline() # open file and read the first line lessons = [u"框架原理","Python", u"用例编写","Keywordsrunner", "Selenium"] total_lessons = len(lessons) line_num = 1 # count the row you are in while True: lesson_record = input_fd.readline().strip() lesson_record_list = lesson_record.split(',') line_num += 1 if lesson_record == '': print "Cannot find your name." input_fd.close() return else: if lesson_record_list[1] == name_in_pinyin: print u"%s在第%d行,报名课程有:" % (name_in_chinese, line_num), context = u"%s在第%d行,报名课程有:" % (name_in_chinese, line_num) count = 0 for i in range(total_lessons): # Lesson fields started from 3rd column if lesson_record_list[i + 2] == '1': count += 1 print "%s," % lessons[i], context = context + "%s, " % lessons[i] print u"总共%d课." % count context = context + u"总共%d课." % count input_fd.close() output_fd = open(output_file, 'wb') output_fd.write(context.encode('gbk')) output_fd.close() return
def get_pinyin(self, df, series, inplace=True):
    """
    Romanize the Chinese text of selected columns into pinyin.

    :param df: <DF> Pandas DataFrame to read from
    :param series: <LIST<STRING>> names of the columns to convert; columns
                   whose dtype is not ``object`` are left untouched
    :param inplace: <BOOLEAN> when True the converted text overwrites the
                    column, when False it is stored in a new column named
                    ``<column>_translate`` and the original is kept
    :return: <DF> a new Pandas DataFrame built from the converted records
    """
    records = df.to_dict('records')
    converter = Pinyin()
    for column in series:
        if df[column].dtypes == object:
            # The destination depends only on the column, not on the row.
            target = column if inplace else column + "_translate"
            for record in records:
                record[target] = converter.get_pinyin(record[column], '')
    return pd.DataFrame(records)
def people(filename): space = u" " with codecs.open(filename, encoding="utf-8") as infile: names = [line.strip() for line in infile] td_format = u"<td width=\"172\">{0}(<a href=\"javascript:toto('{1}')\"><img src=\"img/email.gif\" /></a>)</td>" for index, name in enumerate(names): p = Pinyin() pinyin = p.get_pinyin(name," ").split() if len(name) == 2: pinyin_abbrev = pinyin[0] + pinyin[1][0] name = name[0] + space + name[1] elif len(name) >= 3: pinyin_abbrev = pinyin[0] + "".join(item[0] for item in pinyin[1:]) td = td_format.format(name, pinyin_abbrev) print td if (index+1) % 3 == 0 and index < len(names)-1: print print "<td width=\"98\"> </td>"
def getCities(): '''getCities''' cursor = conn.cursor(); tableName = "pm2_5_cities"; cursor.execute("truncate table %s" % tableName); cursor.execute("SET NAMES utf8"); cursor.execute("SET CHARACTER_SET_CLIENT=utf8"); cursor.execute("SET CHARACTER_SET_RESULTS=utf8"); conn.commit(); saveFileName = "insert_cities.sql"; file = codecs.open(saveFileName, "w", 'utf-8'); fileName = "cities.json"; jsons = readJson(fileName); if(type(jsons) == DictionaryType and jsons.has_key("error")): print "no data"; return; cities = jsons["cities"]; p = Pinyin(); for city in cities: cityCode = ""; cityName = city; spellName = p.get_pinyin(cityName, ''); date = int(time.time()); bz = 1; columns = "cityCode,cityName,spellName,`date`,bz"; values = "'%s','%s','%s',%d,%d" % (cityCode,cityName,spellName,date,bz); sql = "insert into %s(%s) values (%s)" % (tableName,columns,values); sqlstring = "insert into %s(%s) values (%s);\n" % (tableName,columns,values); file.write(sqlstring); cursor.execute(sql.encode("utf-8")); conn.commit(); file.close(); cursor.close(); conn.close();
def align(sent):
    '''
    Align a Chinese sentence with its pinyin, character by character.

    Args:
      sent: A string. A sentence.

    Returns:
      A tuple (pinyin, hanzi) of two equally long strings: the concatenated
      pinyin syllables, and the characters of `sent` (spaces removed) each
      followed by enough "_" to span the length of its syllable.
    '''
    syllables = Pinyin().get_pinyin(sent, " ").split()
    characters = sent.replace(" ", "")

    padded = "".join(
        char + "_" * (len(syllable) - 1)
        for char, syllable in zip(characters, syllables)
    )
    joined = "".join(syllables)

    assert len(joined) == len(padded), "The hanzis and the pinyins must be the same in length."
    return joined, padded
def addnode(request):
    """Create a child node in the Mtree hierarchy and return its id as JSON.

    Requests without "name" in the session are redirected to '/login'.
    The new node takes ``zh_name`` and ``pid`` from the query string, the
    pinyin of ``zh_name`` as ``en_name`` and the parent's depth plus one.
    Its ``gen`` is the parent's ``gen`` extended by '_' and the new id.
    The JSON reply holds ``id`` and ``showid`` (1 at depth 4, otherwise 2).
    """
    if "name" not in request.session:
        return HttpResponseRedirect('/login')

    # Unused below, but the lookup still fails for sessions without "email".
    username = request.session["email"].split('@')[0]

    zh_name = request.GET.get('zh_name')
    parent_id = request.GET.get('pid')
    en_name = Pinyin().get_pinyin(zh_name, '')

    parent = Mtree.objects.get(id=parent_id)
    parent_gen = parent.gen
    depth = parent.deep + 1

    node = Mtree.objects.create(pid=parent_id, deep=depth, zh_name=zh_name, en_name=en_name)
    node_id = node.id
    # The id only exists after the insert, so gen is filled in afterwards.
    Mtree.objects.filter(id=node_id).update(gen=parent_gen + '_' + str(node_id))

    showid = 1 if depth == 4 else 2
    return HttpResponse(json.dumps({'id': node_id, 'showid': showid}))
def post(self):
    """Create an event from the JSON request body.

    The body is decoded into an ``Event``, stamped with the current time and
    enriched with two search fields derived from ``event.code``: its
    space-separated pinyin, and its alphabetic pinyin initials each repeated
    four times.  After saving, a success response is written and QR-code
    creation is scheduled on the IO loop.  Any exception is logged and
    answered with a failure response.
    """
    try:
        payload = json.loads(self.request.body.decode())
        logging.debug(payload)

        event = dict_to_model(Event, payload)
        event.createtime = datetime.datetime.now()

        converter = Pinyin()
        event.codepinyin = converter.get_pinyin(event.code, splitter=' ')
        initial_tokens = converter.get_initials(event.code, splitter=' ').lower().split(' ')
        event.codepinyininitials = ' '.join(
            [token * 4 for token in initial_tokens if token.isalpha()]
        )

        yield dbutil.do(event.save)

        success = Response(status=1, msg='恭喜你,活动发布成功!', result={})
        self.write(success.json())
        ioloop.IOLoop.current().spawn_callback(event_service.create_qrcode, event)
    except Exception as e:
        self.write(Response(msg='sorry,亲,活动发布失败').json())
        logging.exception('CreateEventHandler error: {0}'.format(str(e)))
def pre_save(self, instance, add): # get currently entered slug value = self.value_from_object(instance) # autopopulate if self.always_update or (self.populate_from and not value): value = utils.get_prepopulated_value(self, instance) if __debug__ and not value: print 'Failed to populate slug %s.%s from %s' % \ (instance._meta.object_name, self.name, self.populate_from) # slug = self.slugify(value) # Convert Chinese characters to letters from xpinyin import Pinyin pinyin = Pinyin() slug = pinyin.get_pinyin(value, ) if not slug: # no incoming value, use model name slug = instance._meta.module_name assert slug, 'slug is defined before trying to ensure uniqueness' slug = utils.crop_slug(self, slug) # ensure the slug is unique (if required) if self.unique or self.unique_with: slug = utils.generate_unique_slug(self, instance, slug) assert slug, 'value is filled before saving' # make the updated slug available as instance attribute setattr(instance, self.name, slug) return slug
def parse(self): p = Pinyin() s = Hyphenator('en_US') with codecs.open(self.filepath, encoding='utf-8', mode='r') as f: for line in f: self.count = self.count + 1 line = line[0:-1] words = line.split() if len(words) != 2: print "Error on line", self.count raise ValueError c = words[0].strip() e = words[1].strip().lower() self.ch.append(c) self.pinyin.append(p.get_pinyin(c, ' ').split()) self.en.append(e) if len(e) > 3: syll= s.syllables(e) syll = self.sub_syllables(e, c, syll) else: syll = [e] self.syllables.append(syll)
from xpinyin import Pinyin

# Print the pinyin of a sample word, using get_pinyin's default separator.
print(Pinyin().get_pinyin("和面"))
# -*- coding: utf-8 -*-
from xpinyin import Pinyin
import sys
import re

# Reads one UTF-8 word per line from stdin and writes
# "<word>\t<pinyin>\t1" for every word that romanizes cleanly.
converter = Pinyin()
lowercase_pinyin = re.compile(r'^[a-z ]+$')

for raw_line in sys.stdin:
    word = raw_line.strip().decode('utf-8')
    # Entries that still contain an inner space or tab are skipped.
    if ' ' in word or '\t' in word:
        continue
    word_pinyin = converter.get_pinyin(word, u' ')
    # Keep only results made purely of lowercase letters and spaces.
    if lowercase_pinyin.match(word_pinyin):
        sys.stdout.write(('%s\t%s\t1\n' % (word, word_pinyin)).encode('utf-8'))
# Install the xpinyin module first: http://pypi.python.org/pypi/xpinyin/
from xlrd import open_workbook, cellname
from xlutils.copy import copy
from xpinyin import Pinyin

# Reads the first column of the first sheet of hanzi.xls and stores the
# pinyin of each cell in the second column of a copy saved as pinyin.xls.
source_book = open_workbook('hanzi.xls')
source_sheet = source_book.sheet_by_index(0)
target_book = copy(source_book)
target_sheet = target_book.get_sheet(0)
converter = Pinyin()

for row in range(source_sheet.nrows):
    target_sheet.write(row, 1, converter.get_pinyin(source_sheet.cell(row, 0).value))

target_book.save('pinyin.xls')
raw_input("press any key to exit")
def term_pinyin(word):
    """Return the space-separated pinyin of ``word``.

    The word is passed through ``digi_to_py`` before the conversion.
    """
    prepared = digi_to_py(word)
    return Pinyin().get_pinyin(prepared, ' ')
def insertData():
    """Rebuild the dict_citys table from the indented listing in citys.txt.

    The table is truncated and every line of citys.txt is inserted as one
    row.  The number of " " occurrences in a line selects its level
    (4 -> level 1, 6 -> level 2, 8 -> level 3).  Each row gets a custom code
    made of zero-padded three-digit counters (province, province + city,
    province + city + county), the parent's custom code, the insert id of
    the most recent level 1 / level 2 row as parentId, and the pinyin
    spelling of the name.  Database or I/O errors are printed and end the
    process with exit status 1.

    Uses the module-level ``conn``, which is closed on success.
    """
    print "insertData";
    try:
        file = open("citys.txt", "r");# other modes: w, a, wb (binary)
        cursor = conn.cursor();
        sql = "truncate table dict_citys";
        cursor.execute(sql);
        cursor.execute("SET NAMES utf8");
        cursor.execute("SET CHARACTER_SET_CLIENT=utf8");
        cursor.execute("SET CHARACTER_SET_RESULTS=utf8");
        conn.commit();
        fileList = file.readlines();
        p = Pinyin();
        # One timestamp, taken before the loop, is shared by every row.
        date = int(time.mktime(datetime.datetime.now().timetuple()));
        bz = 1;
        cityCode = "";
        cityName = "";
        spellName = "";
        level = 1;# 1 - province level, 2 - city level, 3 - county level
        customCode = "";
        parentCode = "";
        parentId = "";
        spaceCount = 0;
        space = " ";
        lastSpaceCount = 0;
        provinceIndexCount = 0;
        cityIndexCount = 0;
        countyIndexCount = 0;
        provinceCode = "";
        customCityCode = "";
        for fileLine in fileList:
            #print fileLine;
            # NOTE(review): the separator literals in the three branches
            # below are identical as they appear here; confirm against the
            # original file whether different widths were intended.
            # A line whose count is not 4, 6 or 8 reuses the codes and the
            # separator left over from the previous iteration.
            spaceCount = fileLine.count(" ");
            if(spaceCount == 4):
                # Province row: top level, parent code "0".
                provinceIndexCount += 1;
                parentCode = "0";
                parentId = 0;
                customCode = str(provinceIndexCount).zfill(3);
                provinceCode = customCode;
                space = " ";
                level = 1;
            elif(spaceCount == 6):
                # City row: numbering restarts right after a province row.
                if(lastSpaceCount == 4):
                    cityIndexCount = 0;
                cityIndexCount += 1;
                parentCode = provinceCode;
                customCode = provinceCode + str(cityIndexCount).zfill(3);
                customCityCode = customCode;
                space = " ";
                level = 2;
            elif(spaceCount == 8):
                # County row: numbering restarts right after a city row.
                if(lastSpaceCount == 6):
                    countyIndexCount = 0;
                countyIndexCount += 1;
                parentCode = customCityCode;
                customCode = customCityCode + str(countyIndexCount).zfill(3);
                space = " ";
                level = 3;
            cityInfo = fileLine.split(space);
            cityCode = cityInfo[0];
            cityName = cityInfo[1];
            spellName = p.get_pinyin(cityName.decode("utf-8"), '');
            # Values are interpolated straight into the statement text.
            sql = "insert into dict_citys(cityCode,cityName,spellName,customCode,parentCode,parentId,date,bz) values ('%s','%s','%s','%s','%s',%d,'%s','%s')" % (cityCode,cityName,spellName.encode("utf-8"),customCode,parentCode,parentId,date,bz);
            cursor.execute(sql);
            # Province and city rows become the parent of the rows that follow.
            if(spaceCount == 4):
                parentId = conn.insert_id();
            elif(spaceCount == 6):
                parentId = conn.insert_id();
            conn.commit();
            lastSpaceCount = spaceCount;
        file.close();
        cursor.close();
        conn.close();
    except (mdb.Error, IOError), e:
        print "Error %d: %s" % (e.args[0], e.args[1]);
        sys.exit(1);
from xpinyin import Pinyin

# For every line of D:\text.txt, write the tone-marked pinyin of each
# whitespace-separated word (cells separated by two tabs), then a newline,
# then one pair of parentheses per word, padded with spaces to the length of
# that word's pinyin.
converter = Pinyin()

with open('D:\\text.txt', 'r') as srcFile, open('D:\\pinyin.txt', 'w') as desFile:
    for line in srcFile:
        widths = []
        for word in line.split():
            romanized = converter.get_pinyin(word, splitter=' ', show_tone_marks=True)
            desFile.write(romanized + '\t\t')
            widths.append(len(romanized))
        desFile.write('\n')
        # The two parenthesis characters count towards the cell width.
        desFile.write(''.join('(' + ' ' * (width - 2) + ')' + '\t\t' for width in widths))
return data es = ElasticSearch('http://localhost:9200/') es.delete_index('pet') spider = Spider() breeds = spider.getPetBreeds() p = Pinyin() for breed in breeds: flg = 1 page = 1 pet_list = [] while(flg): pets = spider.getPets(breed, (page - 1) * spider.limit) if not pets: flg = 0 else: page = page + 1 for pet in pets: pet_obj = {} pet_obj['name'] = pet['name'] pet_obj['img'] = pet['img'] pet_obj['type'] = breed['ename'] pet_list.append(pet_obj) #print pet['name'] + '\t' + p.get_pinyin(pet['name'], '') print breed['ename'] + '\n' if not pet_list: continue doc_type = p.get_pinyin(breed['ename'].replace('宠物', ''), '') es.bulk((es.index_op(pet_obj) for pet_obj in pet_list), doc_type=doc_type, index = 'pet') es.refresh('pet')