示例#1
0
    def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
        """Download one list URL and queue the URLs derived from its JSON reply.

        The reply's ``result.total`` decides what is queued:

        * 1 to 10 hits: a page URL is built from ``self.page_url`` for every
          entry of ``content`` and inserted into ``self.url_table`` unless a
          row with the same urlmd5 already exists.
        * 0 hits or fewer: nothing is queued.
        * more than 10 hits: ``boundary`` is subdivided with
          ``split_boundary`` and one list URL per sub-rectangle is inserted
          (no duplicate check is made on this path).

        Afterwards the row identified by ``urlmd5`` is set to status 200.

        :param urlmd5: md5 key of ``url`` in ``self.url_table``.
        :param url: list URL to download.
        :param proxy: proxy setting handed to ``PageDownload``.
        :param boundary: ``"lng_min,lat_min;lng_max,lat_max"`` string; only
            read when the reply reports more than 10 hits.
        :param domain: not used by this method.
        :return: True when the reply was JSON and has been processed,
            False when the download did not yield JSON.
        """
        # Local import so the block does not depend on the file's import header.
        from contextlib import closing

        downloader = PageDownload(proxy=proxy)
        page = downloader.simple_download(url=url)
        if not is_json(page):
            return False

        json_page = json.loads(page)
        total_count = json_page["result"]["total"]
        print("total:" + str(total_count))
        total = int(total_count)
        insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"

        if 0 < total <= 10:
            for item in json_page["content"]:
                new_url = self.page_url % (item["uid"], item["primary_uid"])
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from  " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                # closing() releases the handle even when the DB call raises.
                with closing(MysqlHandle()) as db:
                    results = db.query(sql=sql)
                if results:
                    print("This url is already in the database!!")
                    continue
                # Page URLs carry type 0, no boundary and status 0.
                with closing(MysqlHandle()) as db:
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, 0, None, 0)])
        elif total > 10:
            corners = boundary.split(";")
            min_corner = corners[0].split(",")
            max_corner = corners[1].split(",")
            lng_min, lat_min = min_corner[0], min_corner[1]
            lng_max, lat_max = max_corner[0], max_corner[1]

            sub_boundaries = split_boundary(int(float(lat_max)), int(float(lat_min)),
                                            int(float(lng_max)), int(float(lng_min)), 4, 0.2)
            for sub in sub_boundaries:
                # Index 0 holds the latitude pair and index 1 the longitude
                # pair; the string is rebuilt in "lng,lat;lng,lat" order.
                lat_pair, lng_pair = sub[0], sub[1]
                sub_boundary_st = (str(lng_pair[0]) + "," + str(lat_pair[0]) + ";" +
                                   str(lng_pair[1]) + "," + str(lat_pair[1]))
                new_url = self.list_url % (self.city_code, self.keyword, sub_boundary_st)
                new_urlmd5 = to_md5(in_str=new_url)
                # Follow-up list URLs carry type 1 and status 0.
                with closing(MysqlHandle()) as db:
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, 1, sub_boundary_st, 0)])

        update_sql = "update "+self.url_table+" set status=200 where urlmd5='%s'" % (urlmd5)
        with closing(MysqlHandle()) as db:
            db.update(sql=update_sql)
        return True
示例#2
0
def split_boundary_outline(b_type, city_code, name):
    """Subdivide stored boundaries of one type whose total_count is 400.

    Selects from ``bdmap_api_<name>_<city_code>_boundary_table`` the rows
    with ``type = b_type`` and ``total_count = 400``, cuts each rectangle
    with ``split_boundary(..., 10, 0.1)`` and inserts every resulting cell
    into the same table with type 2.

    NOTE(review): 400 is presumably the API's result cap, which would make
    these the boundaries that need a finer grid -- confirm.

    :param b_type: value matched against the table's ``type`` column.
    :param city_code: city code used in the table name.
    :param name: name fragment used in the table name.
    """
    # Local import so the block does not depend on the file's import header.
    from contextlib import closing

    boundary_table = "bdmap_api_" + name + "_" + str(
        city_code) + "_boundary_table"
    select_sql = 'select lng_min,lat_min,lng_max,lat_max from ' + boundary_table + " where type=" + str(
        b_type) + " and total_count=400"
    insert_sql = "insert into " + boundary_table + " values(%s,%s,2,%s,%s,%s,%s,0,0,now())"

    # The original rebound `db` inside the loop, so the handle used for the
    # SELECT was never closed. It stays open until iteration has finished,
    # because it is not visible here whether query() returns a detached list.
    with closing(MysqlHandle()) as query_db:
        query_res = query_db.query(select_sql)
        for (lng_min, lat_min, lng_max, lat_max) in query_res:
            cells = split_boundary(float(lat_max), float(lat_min),
                                   float(lng_max), float(lng_min), 10, 0.1)
            for cell in cells:
                # Index 0 holds the latitude pair, index 1 the longitude pair.
                cell_lng_min = cell[1][0]
                cell_lat_min = cell[0][0]
                cell_lng_max = cell[1][1]
                cell_lat_max = cell[0][1]
                # Key string order is lat_min,lng_min,lat_max,lng_max.
                cell_st = (str(cell_lat_min) + "," + str(cell_lng_min) + "," +
                           str(cell_lat_max) + "," + str(cell_lng_max))
                md5 = to_md5(cell_st)
                with closing(MysqlHandle()) as db:
                    db.insert(
                        insert_sql,
                        [[md5, cell_st, cell_lng_min, cell_lat_min,
                          cell_lng_max, cell_lat_max]])
示例#3
0
    def add_init_url(self, url_table_name, filter_config, city_code, keyword):
        """Seed the URL table with list URLs that tile the configured boundary.

        The boundary string at ``filter_config["url"]["boundary"]`` (format
        ``"lng_min,lat_min;lng_max,lat_max"``) is cut into sub-rectangles by
        ``split_boundary(..., 20, 0.2)``. For each of them a list URL is built
        from ``city_code``, ``keyword`` and the sub-boundary, then inserted
        into ``self.url_table`` with type 2 and status 0.

        NOTE(review): ``url_table_name`` is never read; rows are written to
        ``self.url_table``. Confirm which table is intended.
        """
        list_url = 'https://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=spot&from=webmap&c=%d&wd=%s&wd2=&pn=0&nn=0&db=0&sug=0&addr=0&pl_data_type=life&pl_sort_type=data_type&pl_sort_rule=0&pl_business_type=cinema&pl_business_id=&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&rn=10&tn=B_NORMAL_MAP&ie=utf-8&b=(%s)'

        seed_boundary = filter_config["url"]["boundary"]
        corners = seed_boundary.split(";")
        min_corner = corners[0].split(",")
        max_corner = corners[1].split(",")
        lat_min, lat_max = min_corner[1], max_corner[1]
        lng_min, lng_max = min_corner[0], max_corner[0]

        cells = split_boundary(int(float(lat_max)), int(float(lat_min)),
                               int(float(lng_max)), int(float(lng_min)),
                               20, 0.2)
        for cell in cells:
            # Index 0 holds the latitude pair, index 1 the longitude pair.
            lat_pair, lng_pair = cell[0], cell[1]
            cell_boundary = ";".join([
                str(lng_pair[0]) + "," + str(lat_pair[0]),
                str(lng_pair[1]) + "," + str(lat_pair[1]),
            ])
            cell_url = list_url % (city_code, keyword, cell_boundary)
            cell_urlmd5 = to_md5(in_str=cell_url)

            handle = MysqlHandle()
            statement = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
            row = (cell_urlmd5, cell_url, 2, cell_boundary, 0)
            handle.insert(sql=statement, value_list=[row])
            handle.close()
示例#4
0
def spider(city_code, name, keyword, key_token):
    """Crawl the Baidu place-search API for every pending boundary.

    For each row of ``bdmap_api_<name>_<city_code>_boundary_table`` with
    ``status = 0`` the first result page is downloaded. When that reply is
    JSON with ``status == 0`` and a positive ``total``, the result pages are
    fetched and every entry of ``results`` is inserted into
    ``bdmap_api_<name>_<city_code>_page_table``. Whenever the first reply is
    JSON, the boundary row is then updated to ``status = 200`` with the
    reported ``total``; rows whose first download is not JSON stay pending.

    :param city_code: city code used in both table names.
    :param name: name fragment used in both table names.
    :param keyword: search term substituted into the ``query`` parameter.
    :param key_token: API key substituted into the ``ak`` parameter.
    """
    # Local import so the block does not depend on the file's import header.
    from contextlib import closing

    boundary_table = "bdmap_api_" + name + "_" + str(
        city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    pending_sql = "select md5, boundary from " + boundary_table + " where status=0"
    insert_sql = "insert into " + page_table + " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)"

    # The original rebound `db` inside the loop, so the handle used for the
    # SELECT was never closed. It stays open until iteration has finished,
    # because it is not visible here whether query() returns a detached list.
    with closing(MysqlHandle()) as query_db:
        res_data = query_db.query(pending_sql)
        for (md5, boundary) in res_data:
            url = base_url % (keyword, boundary, key_token, 0)
            page = PageDownload().simple_download(url)
            if not is_json(page):
                continue

            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                # `//` keeps integer division explicit, so range() gets an int.
                # NOTE(review): `+ 2` requests one or two pages beyond
                # ceil(total / 10), and page 0 is downloaded a second time.
                # Left unchanged pending confirmation that it is unintended.
                page_count = int(total) // 10
                for page_num in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, page_num)
                    _page = PageDownload().simple_download(_url)
                    if not is_json(_page):
                        continue
                    for item in json.loads(_page)["results"]:
                        # `poi_name` avoids shadowing the `name` parameter.
                        poi_name = item["name"]
                        address = item["address"]
                        province = item["province"]
                        city = item["city"]
                        area = item["area"]
                        uid = item["uid"]
                        _md5 = to_md5(uid)
                        lat = item["location"]["lat"]
                        lng = item["location"]["lng"]
                        try:
                            tag = item["detail_info"]["tag"]
                        except (KeyError, TypeError) as e:
                            # The tag is optional: store NULL and log why.
                            tag = None
                            # `.message` only exists on Python 2 exceptions.
                            print(getattr(e, "message", e))
                        with closing(MysqlHandle()) as db:
                            db.insert(insert_sql, [[
                                _md5, uid, poi_name, address, province, city,
                                area, lng, lat, tag
                            ]])

            # NOTE(review): `total` comes from the remote reply and is spliced
            # into the statement as-is; parameterize it if MysqlHandle.update
            # supports bound values.
            update_sql = 'update ' + boundary_table + ' set status=200,total_count=' + str(
                total) + ' where md5="' + md5 + '"'
            with closing(MysqlHandle()) as db:
                db.update(update_sql)
示例#5
0
 def download_list_page(self, urlmd5, url, proxy, url_table, filter_table,
                        domain=None):
     """Download a list page, queue its accepted links and mark it done.

     Every match of ``self.reg`` in the page is turned into a URL (prefixed
     with ``domain`` when one is given and the match starts with "/"),
     classified with ``self.filter_url`` and, when it gets a type and is not
     yet present in ``url_table``, inserted with status 0. Finally the row
     identified by ``urlmd5`` is set to status 200.

     :param urlmd5: md5 key of ``url`` in ``url_table``.
     :param url: page to download.
     :param proxy: proxy setting handed to ``PageDownload``.
     :param url_table: table that stores the crawl queue.
     :param filter_table: passed through to ``self.filter_url``.
     :param domain: optional prefix for links that start with "/".
     :return: False when the download returned None, True otherwise.
     """
     page = PageDownload(proxy=proxy).simple_download(url=url)
     if page is None:
         return False

     for link in re.findall(self.reg, page):
         if domain is not None and link.startswith("/"):
             candidate = domain + link
         else:
             candidate = link

         link_type = self.filter_url(url=candidate, filter_table=filter_table)
         if link_type is None:
             # The filter rejected this link.
             continue

         candidate_md5 = to_md5(in_str=candidate)
         lookup_sql = "select * from  " + url_table + " where urlmd5='%s'" % (
             candidate_md5)
         handle = MysqlHandle()
         existing = handle.query(sql=lookup_sql)
         handle.close()
         if existing:
             print("This url is already in the database!!")
             continue

         handle = MysqlHandle()
         insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
         handle.insert(sql=insert_sql,
                       value_list=[(candidate_md5, candidate, link_type, 0)])
         handle.close()

     done_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (
         urlmd5)
     handle = MysqlHandle()
     handle.update(sql=done_sql)
     handle.close()
     return True
def update_geo_data(uid, l_type, city_code, name):
    """Fetch the ``geo`` value for one uid and store it in the page table.

    Downloads the ``qt=ext`` reply for ``uid``; when it is JSON and contains
    ``content.geo``, the ``geo`` column of the row whose md5 equals
    ``to_md5(uid)`` in ``bdmap_api_<name>_<city_code>_page_table`` is
    updated. The function then sleeps for 0.5 to 1.0 seconds whether or not
    a row was updated.

    :param uid: identifier substituted into the ``uid`` query parameter.
    :param l_type: integer substituted into the ``l`` query parameter.
    :param city_code: integer city code (URL parameter ``c`` and table name).
    :param name: name fragment used in the table name.
    """
    # Local import so the block does not depend on the file's import header.
    from contextlib import closing

    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = "http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new" % (
        uid, city_code, l_type)
    page = PageDownload().simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        # `in` replaces the deprecated dict.has_key(); the isinstance checks
        # make a non-object payload a no-op instead of an AttributeError.
        content = json_data.get("content") if isinstance(json_data, dict) else None
        if isinstance(content, dict) and "geo" in content:
            geo = content["geo"]
            print(uid)
            md5 = to_md5(uid)
            # NOTE(review): `geo` comes from the remote reply and is spliced
            # into the statement unescaped (a double quote in it breaks the
            # SQL). Parameterize it if MysqlHandle.update supports bound
            # values.
            sql = "update " + page_table + ' set geo="' + geo + '" where md5="' + md5 + '"'
            with closing(MysqlHandle()) as db:
                db.update(sql)
    # Random pause between consecutive requests.
    time.sleep(random.uniform(0.5, 1.0))
示例#7
0
def init_spider(city_code, name, boundary):
    """Fill the boundary table with the initial grid for one city.

    ``boundary`` is unpacked as ``(lng_min, lat_min, lng_max, lat_max)`` and
    cut with ``split_boundary(..., 10, 0.1)``. Each resulting cell is
    inserted into ``bdmap_api_<name>_<city_code>_boundary_table`` with
    type 1, keyed by the md5 of its ``lat_min,lng_min,lat_max,lng_max``
    string.
    """
    # Disabled in the original:
    # Initializer(source="bdmap_api_"+name+"_"+str(city_code), table_config="table_config.json", filter_config=None, need_proxy=False)
    table = "_".join(["bdmap_api", name, str(city_code), "boundary_table"])
    lng_min, lat_min, lng_max, lat_max = boundary

    cells = split_boundary(float(lat_max), float(lat_min), float(lng_max),
                           float(lng_min), 10, 0.1)
    for cell in cells:
        # Index 0 holds the latitude pair, index 1 the longitude pair.
        lat_pair, lng_pair = cell[0], cell[1]
        cell_lat_min, cell_lat_max = lat_pair[0], lat_pair[1]
        cell_lng_min, cell_lng_max = lng_pair[0], lng_pair[1]

        cell_key = ",".join([
            str(cell_lat_min), str(cell_lng_min),
            str(cell_lat_max), str(cell_lng_max),
        ])
        cell_md5 = to_md5(cell_key)

        handle = MysqlHandle()
        statement = "insert into " + table + " values(%s,%s,1,%s,%s,%s,%s,0,0,now())"
        row = [cell_md5, cell_key, cell_lng_min, cell_lat_min,
               cell_lng_max, cell_lat_max]
        handle.insert(statement, [row])
        handle.close()
示例#8
0
 def download_list_page(self, urlmd5, url, proxy, domain=None):
     """Download a list page, queue its accepted links and mark it done.

     Every match of ``self.reg`` in the page is turned into a URL (prefixed
     with ``domain`` when one is given and the match starts with "/"),
     classified with ``self.filter_url`` and, when it gets a type and is not
     yet present in ``self.url_table``, inserted with status 0. Finally the
     row identified by ``urlmd5`` is set to status 200.

     :param urlmd5: md5 key of ``url`` in ``self.url_table``.
     :param url: page to download (10 second timeout).
     :param proxy: proxy setting handed to ``PageDownload``.
     :param domain: optional prefix for links that start with "/".
     :return: False when the download returned None, True otherwise.
     """
     page = PageDownload(proxy=proxy, timeout=10).simple_download(url=url)
     if page is None:
         return False

     # Disabled in the original: matches of self.js0_reg (singer names) were
     # also queued as
     # "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name.
     for link in re.findall(self.reg, page):
         if domain is not None and link.startswith("/"):
             candidate = domain + link
         else:
             candidate = link

         link_type = self.filter_url(url=candidate)
         if link_type is None:
             # The filter rejected this link.
             continue

         candidate_md5 = to_md5(in_str=candidate)
         lookup_sql = "select * from  "+self.url_table+" where urlmd5='%s'" % (candidate_md5)
         handle = MysqlHandle()
         existing = handle.query(sql=lookup_sql)
         handle.close()
         if existing:
             print("This url is already in the database!!")
             continue

         handle = MysqlHandle()
         insert_sql = "insert into "+self.url_table+" values (%s,%s,%s,%s,now())"
         handle.insert(sql=insert_sql,
                       value_list=[(candidate_md5, candidate, link_type, 0)])
         handle.close()

     done_sql = "update "+self.url_table+" set status=200 where urlmd5='%s'" % (urlmd5)
     handle = MysqlHandle()
     handle.update(sql=done_sql)
     handle.close()
     return True