import json
import random
import re
import time

# PageDownload, MysqlHandle, to_md5, is_json and split_boundary are helpers from
# the project's own modules and are assumed to be imported alongside the above.


def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
    # Download one Baidu map list page. A box with at most 10 hits gets one
    # detail url queued per POI; a fuller box is split into sub-boxes and a new
    # list url is queued for each of them.
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        json_page = json.loads(page)
        result = json_page["result"]
        total_count = result["total"]
        print("total:" + str(total_count))
        if 0 < int(total_count) <= 10:
            content = json_page["content"]
            for item in content:
                uid = item["uid"]
                primary_uid = item["primary_uid"]
                new_url = self.page_url % (uid, primary_uid)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 0
                boundary = None
                status = 0
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        elif int(total_count) <= 0:
            pass
        else:
            # More than 10 hits: split the bounding box ("lng,lat;lng,lat") into
            # a finer grid and queue a new list url per sub-box.
            min_interval = boundary.split(";")[0]
            max_interval = boundary.split(";")[1]
            lat_min = min_interval.split(",")[1]
            lat_max = max_interval.split(",")[1]
            lng_min = min_interval.split(",")[0]
            lng_max = max_interval.split(",")[0]
            boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                                       int(float(lng_max)), int(float(lng_min)), 4, 0.2)
            for _boundary in boundarys:
                _boundary_st = (str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" +
                                str(_boundary[1][1]) + "," + str(_boundary[0][1]))
                new_url = self.list_url % (self.city_code, self.keyword, _boundary_st)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 1
                boundary = _boundary_st
                status = 0
                db = MysqlHandle()
                insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                db.insert(sql=insert_sql,
                          value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
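# split_boundary() is a project helper not shown in this file. A minimal sketch
# of what the call sites above appear to expect: the box (lat_min..lat_max,
# lng_min..lng_max) is cut into an n x n grid and each cell is returned as
# ((cell_lat_min, cell_lat_max), (cell_lng_min, cell_lng_max)). The role of the
# final float argument (0.1 / 0.2 at the call sites) is not recoverable from
# this file, so the sketch ignores it; the real helper may differ.
def split_boundary(lat_max, lat_min, lng_max, lng_min, n, step=None):
    lat_size = (lat_max - lat_min) / float(n)
    lng_size = (lng_max - lng_min) / float(n)
    cells = []
    for i in range(n):
        for j in range(n):
            cell_lat_min = lat_min + i * lat_size
            cell_lng_min = lng_min + j * lng_size
            cells.append(((cell_lat_min, cell_lat_min + lat_size),
                          (cell_lng_min, cell_lng_min + lng_size)))
    return cells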
def split_boundary_outline(b_type, city_code, name):
    # Take every boundary of the given type whose crawl recorded total_count=400
    # (treated as a saturated cell) and split it into a 10 x 10 finer grid,
    # stored as type-2 rows so spider() can crawl them again.
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    sql = ("select lng_min,lat_min,lng_max,lat_max from " + boundary_table +
           " where type=" + str(b_type) + " and total_count=400")
    db = MysqlHandle()
    query_res = db.query(sql)
    db.close()
    for (lng_min, lat_min, lng_max, lat_max) in query_res:
        boundarys = split_boundary(float(lat_max), float(lat_min),
                                   float(lng_max), float(lng_min), 10, 0.1)
        for _boundary in boundarys:
            _lng_min = _boundary[1][0]
            _lat_min = _boundary[0][0]
            _lng_max = _boundary[1][1]
            _lat_max = _boundary[0][1]
            _boundary_st = (str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," +
                            str(_boundary[0][1]) + "," + str(_boundary[1][1]))
            md5 = to_md5(_boundary_st)
            db = MysqlHandle()
            sql = "insert into " + boundary_table + " values(%s,%s,2,%s,%s,%s,%s,0,0,now())"
            db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
            db.close()
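# MysqlHandle is the project's database wrapper and is not shown here. A minimal
# sketch of the interface the functions in this file use (query / insert /
# update / close), written with pymysql; the connection settings below are
# placeholders, not the project's real configuration.
import pymysql

class MysqlHandle(object):
    def __init__(self):
        # Placeholder credentials -- adjust to the real deployment.
        self.conn = pymysql.connect(host="127.0.0.1", user="root",
                                    password="password", database="spider",
                                    charset="utf8")

    def query(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        rows = cursor.fetchall()
        cursor.close()
        return rows

    def insert(self, sql, value_list):
        # value_list is a sequence of parameter tuples matching the %s placeholders.
        cursor = self.conn.cursor()
        cursor.executemany(sql, value_list)
        self.conn.commit()
        cursor.close()

    def update(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()
        cursor.close()

    def close(self):
        self.conn.close()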
def add_init_url(self, url_table_name, filter_config, city_code, keyword):
    # Seed the url table: split the configured start boundary into a coarse grid
    # and queue one list url (type=2) per cell.
    list_url = 'https://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=spot&from=webmap&c=%d&wd=%s&wd2=&pn=0&nn=0&db=0&sug=0&addr=0&pl_data_type=life&pl_sort_type=data_type&pl_sort_rule=0&pl_business_type=cinema&pl_business_id=&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&rn=10&tn=B_NORMAL_MAP&ie=utf-8&b=(%s)'
    url = filter_config["url"]
    # db = MysqlHandle()
    # insert_sql = "insert into " + url_table_name + " values(%s,%s,%s,%s,%s,now())"
    # db.insert(sql=insert_sql, value_list=[(url["urlmd5"], url["url"], url["type"], url["boundary"], url["status"])])
    # db.close()
    boundary = url["boundary"]
    min_interval = boundary.split(";")[0]
    max_interval = boundary.split(";")[1]
    lat_min = min_interval.split(",")[1]
    lat_max = max_interval.split(",")[1]
    lng_min = min_interval.split(",")[0]
    lng_max = max_interval.split(",")[0]
    boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                               int(float(lng_max)), int(float(lng_min)), 20, 0.2)
    for _boundary in boundarys:
        _boundary_st = (str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" +
                        str(_boundary[1][1]) + "," + str(_boundary[0][1]))
        new_url = list_url % (city_code, keyword, _boundary_st)
        new_urlmd5 = to_md5(in_str=new_url)
        url_type = 2
        boundary = _boundary_st
        status = 0
        db = MysqlHandle()
        insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
        db.insert(sql=insert_sql,
                  value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
        db.close()
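# to_md5() only appears as a call in this file. A minimal sketch assuming it
# returns the hex MD5 digest of the input string (Python 2: unicode input is
# encoded as UTF-8 first); the real helper may differ.
import hashlib

def to_md5(in_str):
    if isinstance(in_str, unicode):
        in_str = in_str.encode("utf-8")
    return hashlib.md5(in_str).hexdigest()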
def spider(city_code, name, keyword, key_token):
    # Crawl every uncrawled boundary through the Baidu Place API, paging through
    # the results and storing one row per POI.
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    db.close()
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                # 10 results per page; fetch a couple of extra pages to be safe.
                page_count = int(total) / 10
                for x in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            poi_name = item["name"]  # renamed so the 'name' argument is not clobbered
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception, e:
                                tag = None
                                print(e.message)
                            sql = ("insert into " + page_table +
                                   " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)")
                            db = MysqlHandle()
                            db.insert(sql, [[_md5, uid, poi_name, address, province,
                                             city, area, lng, lat, tag]])
                            db.close()
            # Record the total and mark the boundary as crawled, so saturated
            # cells can be re-split later by split_boundary_outline().
            sql = ('update ' + boundary_table + ' set status=200,total_count=' +
                   str(total) + ' where md5="' + md5 + '"')
            db = MysqlHandle()
            db.update(sql)
            db.close()
def download_list_page(self, urlmd5, url, proxy, url_table, filter_table, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        for _url in new_urls:
            if domain is not None:
                if _url.startswith("/"):
                    new_url = domain + _url
                else:
                    new_url = _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url, filter_table=filter_table)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def update_geo_data(uid, l_type, city_code, name):
    # Fetch the extended record for one POI uid and, if it carries a "geo"
    # field, write the geometry back into the page table.
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = ("http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new"
           % (uid, city_code, l_type))
    downloader = PageDownload()
    page = downloader.simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if json_data.has_key("content"):
            content = json_data["content"]
            if content.has_key("geo"):
                geo = content["geo"]
                print(uid)
                md5 = to_md5(uid)
                sql = "update " + page_table + ' set geo="' + geo + '" where md5="' + md5 + '"'
                db = MysqlHandle()
                db.update(sql)
                db.close()
    # Throttle a little between requests.
    time.sleep(random.uniform(0.5, 1.0))
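# is_json() is used throughout this file as the guard before json.loads(). A
# minimal sketch: it must tolerate None, since simple_download() can fail and
# return nothing. The real helper may differ.
import json

def is_json(page):
    if page is None:
        return False
    try:
        json.loads(page)
    except (ValueError, TypeError):
        return False
    return True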
def init_spider(city_code, name, boundary):
    # Initializer(source="bdmap_api_"+name+"_"+str(city_code), table_config="table_config.json", filter_config=None, need_proxy=False)
    # Split the city's outer bounding box into a first-level grid and store one
    # type-1 row per cell, ready for spider() to crawl.
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    lng_min, lat_min, lng_max, lat_max = boundary
    boundarys = split_boundary(float(lat_max), float(lat_min),
                               float(lng_max), float(lng_min), 10, 0.1)
    for _boundary in boundarys:
        _lng_min = _boundary[1][0]
        _lat_min = _boundary[0][0]
        _lng_max = _boundary[1][1]
        _lat_max = _boundary[0][1]
        _boundary_st = (str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," +
                        str(_boundary[0][1]) + "," + str(_boundary[1][1]))
        md5 = to_md5(_boundary_st)
        db = MysqlHandle()
        sql = "insert into " + boundary_table + " values(%s,%s,1,%s,%s,%s,%s,0,0,now())"
        db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
        db.close()
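# A hedged sketch of how the Place-API pieces above appear to fit together:
# seed the boundary grid, crawl it, re-split the saturated cells, crawl the
# finer cells, then backfill geometry per POI. The city code, name, keyword,
# bounding box and AK below are placeholder values, not taken from the project.
def run_bdmap_api_pipeline():
    city = 131                      # placeholder Baidu city code
    city_name = "demo_city"         # placeholder table-name suffix
    keyword = u"cinema"             # placeholder search keyword
    ak = "YOUR_BAIDU_AK"            # placeholder API key
    bbox = (116.07, 39.66, 116.73, 40.21)   # placeholder lng_min, lat_min, lng_max, lat_max

    init_spider(city, city_name, bbox)          # seed the first-level grid (type=1)
    spider(city, city_name, keyword, ak)        # crawl every boundary with status=0
    split_boundary_outline(1, city, city_name)  # re-split type-1 cells that recorded 400 results
    spider(city, city_name, keyword, ak)        # crawl the finer type-2 cells
    # update_geo_data(uid, l_type, city, city_name) would then be called per POI uid.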
def download_list_page(self, urlmd5, url, proxy, domain=None):
    # Extract candidate urls from the downloaded page, keep the ones
    # filter_url() recognises, and queue any that are not already in the url
    # table. The download uses a 10-second timeout.
    downloader = PageDownload(proxy=proxy, timeout=10)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        # singer_names = re.findall(self.js0_reg, page)
        # for singer_name in singer_names:
        #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
        #     new_urls.append(merge_url)
        for _url in new_urls:
            if domain is not None:
                if _url.startswith("/"):
                    new_url = domain + _url
                else:
                    new_url = _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
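# PageDownload is the project's HTTP wrapper; only its constructor kwargs
# (proxy, timeout) and simple_download() are visible above. A minimal sketch
# using requests that returns the response text or None on any failure; the
# real class may differ.
import requests

class PageDownload(object):
    def __init__(self, proxy=None, timeout=10):
        self.proxies = {"http": proxy, "https": proxy} if proxy else None
        self.timeout = timeout

    def simple_download(self, url):
        try:
            resp = requests.get(url, proxies=self.proxies, timeout=self.timeout)
            if resp.status_code == 200:
                return resp.text
        except requests.RequestException:
            pass
        return None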