for url in shop_urls: url_open = "http://www.dianping.com" + url if db_shop.find_one({"mallid": 2001, "storeid": filter(str.isdigit, str(url))}): print "p{}-exist: {}".format(i, url_open) count_exi = count_exi + 1 continue print "p{}: {}".format(i, url_open) html = get_html(url_open) shop_info = h2d_dianping(html=html, storeid=url).h2d() print shop_info if shop_info["addr"]: # 获取地理坐标 maximum retry = 3 for i in range(0, 3): loc_coordinates = [] try: loc_coordinates = geocode(shop_info["addr"].encode("utf-8")) except Exception, e: print e if loc_coordinates: loc = [] loc.append(loc_coordinates[0][0]["lng"]) loc.append(loc_coordinates[0][0]["lat"]) shop_info["loc"] = {"type": "Point", "coordinates": loc} print loc break print "geocode retry: " + str(i) db_shop.replace_one({"mallid": shop_info["mallid"], "storeid": shop_info["storeid"]}, shop_info, upsert=True) count = count + 1 if not page_site["next"]: break
"mallid": 2001, "storeid": filter(str.isdigit, str(url)) }): print "p{}-exist: {}".format(i, url_open) count_exi = count_exi + 1 continue print "p{}: {}".format(i, url_open) html = get_html(url_open) shop_info = h2d_dianping(html=html, storeid=url).h2d() print shop_info if shop_info["addr"]: # 获取地理坐标 maximum retry = 3 for i in range(0, 3): loc_coordinates = [] try: loc_coordinates = geocode( shop_info["addr"].encode("utf-8")) except Exception, e: print e if loc_coordinates: loc = [] loc.append(loc_coordinates[0][0]["lng"]) loc.append(loc_coordinates[0][0]["lat"]) shop_info["loc"] = { "type": "Point", "coordinates": loc } print loc break print "geocode retry: " + str(i) db_shop.replace_one( {
db_shop.create_index([("loc", GEOSPHERE)], unique=True, background=True) if __name__ == '__main__': all = db_shop.find() print "all:{}".format(all.count()) count = 0 for shop in all: print "======{}=======".format(count) print shop shop_id = shop["_id"] address = shop["addr"] for i in range(0, 3): loc_coordinates = [] try: #time.sleep(5) loc_coordinates = geocode(address.encode("utf-8")) except Exception, e: print e if loc_coordinates: break print "retry: " + str(i) if loc_coordinates: loc = [] loc.append(loc_coordinates[0][0]["lng"]) loc.append(loc_coordinates[0][0]["lat"]) db_shop.update_one( {"_id": shop_id}, {"$set": { "loc": { "type": "Point", "coordinates": loc