Example #1
 def download_page(self, urlmd5, url, proxy):
     downloader = PageDownload(proxy=proxy)
     page = downloader.simple_download(url=url)
     if page is not None:
         file_name = self.extract_field_from_page(
             page=page, reg=r'<li class="fl ml_1 mt_08 c999">([^=]+)</li>')
         # if file_name is None:
         #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
         singer_name = self.extract_field_from_page(
             page=page,
             reg=r'<li><a class="fl c3b ml_1 mt_08" href="http:'
             r'//www.51ape.com/[^=]+/" title="[^=]+">([^=]+)'
             r'</a></li>')
         baiduyun_url = self.extract_field_from_page(
             page=page, reg=r'href="(https?://pan.baidu.com/s/[^=]+)"')
         baiduyun_password = self.extract_field_from_page(
             page=page, reg=r'提取<em class="dn"></em>密码:(\w+)</b>')
         sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,%s,now())"
         db = MysqlHandle()
         db.insert(sql=sql,
                   value_list=[(urlmd5, url, file_name, singer_name,
                                baiduyun_url, baiduyun_password)])
         db.close()
         update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (
             urlmd5)
         db = MysqlHandle()
         db.update(sql=update_sql)
         db.close()
         return True
     else:
         return False
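Every example in this listing leans on the same two helpers, `PageDownload.simple_download` and `extract_field_from_page`, whose definitions are not part of the snippets. As a reading aid only, here is a minimal sketch of what they presumably do, assuming `PageDownload` wraps `requests` and the field extractor wraps `re.findall`; the names come from the call sites, everything else is an assumption:

# Hypothetical sketch, inferred from the call sites in the examples above and below.
import re
import requests


class PageDownload(object):
    def __init__(self, proxy=None, timeout=10):
        self.proxy = proxy        # e.g. {'http': 'http://1.2.3.4:8080'}, as built in Example #3
        self.timeout = timeout

    def simple_download(self, url):
        # Return the raw page body on success and None on any failure,
        # which is exactly what the callers test with "if page is not None".
        try:
            resp = requests.get(url, proxies=self.proxy, timeout=self.timeout)
            if resp.status_code == 200:
                return resp.content
        except requests.RequestException:
            pass
        return None


def extract_field_from_page(page, reg, index=0):
    # Return the captured group(s) of one match, or None when the pattern
    # does not match; index=-1 (used in Example #4) picks the last match.
    matches = re.findall(reg, page)
    if not matches:
        return None
    return matches[index]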
Example #2
    def download_page(self, urlmd5, url, proxy):
        downloader = PageDownload(proxy=proxy)
        page = downloader.simple_download(url=url)
        if page is not None:
            #print page.decode("utf-8")
            file_name = self.extract_field_from_page(page=page, reg=r'专辑名称:([^<]+)')
            if file_name is None:
                file_name = self.extract_field_from_page(page=page, reg=r'<h1 class="title">([^<]+)</h1>')
            music_type = self.extract_field_from_page(page=page, reg=r'&nbsp;<a href="/\w+/">([^<]+)</a>')

            # if file_name is None:
            #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
            singer_name = self.extract_field_from_page(page=page, reg=r'专辑艺人:([^<]+)')
            baiduyun_url = self.extract_field_from_page(page=page, reg=r"""<a href="#ecms" onclick="window.open\('([^<]+)','','width=300,height=300,resizable=yes'\)""")
            print baiduyun_url
            if baiduyun_url is None:
                return False
            baiduyun_url = self.domain + baiduyun_url
            baiduyun_password = self.extract_field_from_page(page=page, reg=r'密码: (\w+)')
            sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,%s,%s,now())"
            db = MysqlHandle()
            db.insert(sql=sql, value_list=[(urlmd5, url, file_name,music_type, singer_name, baiduyun_url,
                                            baiduyun_password)])
            db.close()
            update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
            db = MysqlHandle()
            db.update(sql=update_sql)
            db.close()
            return True
        else:
            return False
Example #3
    def run(self):
        download_count = 0
        total_count = 0
        fail_count_20 = 0
        start_time = time.time()
        while 1:
            try:
                self.tuple_from_queue = self.queue.get()
                if self.tuple_from_queue is None:
                    print u'url pool is empty, please wait for 90s'
                    time.sleep(90)
                    break
                else:
                    print self.tuple_from_queue
                    start_time_this_round = time.time()
                    download_count += 1
                    self.lock.acquire()
                    try:
                        if not self.need_proxy:
                            self.proxy = None
                        download_result = self.spider.process(tuple_from_queue=self.tuple_from_queue, proxy=self.proxy)
                    finally:
                        # release the lock even if process() raises, otherwise the
                        # except-branch below would leave it held forever
                        self.lock.release()

                    total_count += download_result['total']
                    if download_result['total'] != download_result['success']:  # if not every url in this batch succeeded, count the failures
                        fail_count_20 += download_result['total'] - download_result['success']
                        for failed_data in download_result['failed_list']:  # put the failed url records back into the persistent queue for retry
                            self.queue.put(failed_data)
                            print failed_data
                batch = download_count % self.CHKTHD
                if batch == 0:
                    self.activity = 1 - float(fail_count_20) / float(total_count)
                    print '[%s] COUNT: %d, FAIL in last %d: %d, avail: %f' % (
                        self.proxy_ip, self.CHKTHD, total_count, fail_count_20,
                        self.activity)
                    fail_count_20 = 0
                    total_count = 0
                    if self.activity < 0.3:
                        print '[%s]rate of download is %f,too low' % (self.proxy_ip, self.activity)
                        db = MysqlHandle()
                        sql = "update temp_ips_manage set availabity=%s where proxy='%s'" % (
                        self.activity, self.proxy_ip)
                        db.update(sql=sql)
                        db.close()
                        # self.change_proxy()
                        self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                spider_time = time.time() - start_time
                if spider_time > 600:
                    print '[%s] timeout, switching proxy' % (self.proxy_ip)
                    self.proxy = {'http': 'http://' + self.proxy_manger.change_proxy(self.proxy_ip)[1]}
                    start_time = time.time()
                elapsed = time.time() - start_time_this_round
                interval = random.randint(self.interval_down, self.interval_upp)
                if elapsed < interval:
                    time.sleep(interval - elapsed)
            except Exception,e:
                print e.message
                print self.tuple_from_queue
                if type(self.tuple_from_queue)==list and len(self.tuple_from_queue)==3:

                    self.queue.put(self.tuple_from_queue)
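The worker loop above only depends on `self.spider.process` returning a small result dictionary. A sketch of that assumed contract, written against the per-URL `download_page` methods shown elsewhere in this listing (the real batching logic is not shown here):

    # Hypothetical sketch of the return shape run() expects from spider.process():
    # how many URLs were attempted, how many succeeded, and which queue tuples
    # should be put back for a retry.
    def process(self, tuple_from_queue, proxy):
        failed_list = []
        success = 0
        urlmd5, url = tuple_from_queue[0], tuple_from_queue[1]
        if self.download_page(urlmd5=urlmd5, url=url, proxy=proxy):
            success += 1
        else:
            failed_list.append(tuple_from_queue)
        return {'total': 1, 'success': success, 'failed_list': failed_list}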
Example #4
    def download_page(self, urlmd5, url, proxy):
        downloader = PageDownload(proxy=proxy, timeout=5)
        page = downloader.simple_download(url=url)
        if page is not None:
            company_name = self.extract_field_from_page(
                page=page,
                reg=r"""<meta name="keywords" content="([^=]+)"/>""",
                index=-1)
            company_net = self.extract_field_from_page(
                page=page,
                reg=
                r'"/companyurlimg.php\?url=([^<]+)" alt="myImage" style="border:none'
            )
            address = self.extract_field_from_page(page=page.decode("utf-8"),
                                                   reg="<p>([^=]+)</p>",
                                                   index=-1)

            # if file_name is None:
            #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')

            sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,now())"
            db = MysqlHandle()
            db.insert(sql=sql,
                      value_list=[(urlmd5, url, company_name, company_net,
                                   address)])
            db.close()
            update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (
                urlmd5)
            db = MysqlHandle()
            db.update(sql=update_sql)
            db.close()
            return True
        else:
            return False
Example #5
def download_page():
    db = MysqlHandle()
    query_sql = "select uid,min(name),min(line_type), min(page_url) from baidu_busline_url_analyse where status=0 group by uid "
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()

    for item in page_infs:
        print(item[0])
        page = downloader.simple_download(item[3])
        # if is_json(page):
        #     json_page = json.loads(page)
        #     if json_page.has_key("content"):
        #         main_info = json_page["content"][0]
        #         name = main_info["name"]
        #         timeable = main_info["timeable"]
        db = MysqlHandle()
        is_success = False
        if page is not None:
            insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL )"
            is_success = db.insert(insert_sql,
                                   [(item[0], item[1], item[2], page)])
        if is_success and page is not None:
            update_sql = "update baidu_busline_url_analyse set status=200 where uid='%s'" % (
                item[0])
            db.update(update_sql)
        db.close()
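`MysqlHandle` is the other helper every example opens and closes around each statement. A rough sketch of the interface those calls imply, assuming a thin wrapper over `pymysql`; the connection settings below are placeholders, not the project's real configuration:

# Hypothetical sketch of the MysqlHandle wrapper used throughout this listing.
import pymysql


class MysqlHandle(object):
    def __init__(self, host='127.0.0.1', user='root', password='', db='spider'):
        # placeholder credentials, for illustration only
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    db=db, charset='utf8mb4')

    def query(self, sql):
        # used as db.query(sql) / db.query(sql=sql); returns all rows
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
            return cursor.fetchall()

    def insert(self, sql, value_list):
        # parameterized insert; value_list is a list of tuples, as in the examples
        try:
            with self.conn.cursor() as cursor:
                cursor.executemany(sql, value_list)
            self.conn.commit()
            return True
        except pymysql.MySQLError:
            self.conn.rollback()
            return False

    def update(self, sql):
        with self.conn.cursor() as cursor:
            cursor.execute(sql)
        self.conn.commit()
        return True

    def close(self):
        self.conn.close()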
Example #6
 def download_page(self, urlmd5, url, proxy):
     downloader = PageDownload(proxy=proxy)
     page = downloader.simple_download(url=url)
     if page is not None:
         file_name = self.extract_field_from_page(
             page=page, reg=r'<h1>([^<]+)\[FLAC格式\]下载</h1>')
         if file_name is None:
             file_name = self.extract_field_from_page(
                 page=page, reg=r'<h1>([^<]+)下载?</h1>')
         baiduyun_url = self.extract_field_from_page(
             page=page,
             reg=r'<meta name="description" content="'
             r'[^<]*(https?://pan.baidu.com/[^\s]+) [^<]+"/>')
         baiduyun_password = self.extract_field_from_page(
             page=page,
             reg=r'<meta name="description" content'
             r'="[^<]+密码[\W]+(\w+)..."/>')
         sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,now())"
         db = MysqlHandle()
         db.insert(sql=sql,
                   value_list=[(urlmd5, url, file_name, baiduyun_url,
                                baiduyun_password)])
         db.close()
         update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (
             urlmd5)
         db = MysqlHandle()
         db.update(sql=update_sql)
         db.close()
         return True
     else:
         return False
Example #7
 def download_page(self, urlmd5, url, proxy, url_table, page_table):
     downloader = PageDownload(proxy=proxy)
     page = downloader.simple_download(url=url)
     if page is not None:
         file_name = self.extract_field_from_page(
             page=page,
             reg=r'<h1 class="yh mt_1 f_32">([^<]+\.[a-z]+)</h1>')
         file_size = self.extract_field_from_page(
             page=page,
             reg=r'<h3 class="c999 fl mt_05 f_12 n yh">'
             r'<em class="n ml_1 mr_1">·</em>(\d+\.?\d+M)</h3>')
         baiduyun_url = self.extract_field_from_page(
             page=page, reg=r'href="(https?://pan.baidu.com/[^\s]+)"')
         baiduyun_password = self.extract_field_from_page(
             page=page, reg=r'<em class="dn"></em>密码:(\w+)</b>')
         sql = "insert into " + page_table + "  values (%s,%s,%s,%s,%s,%s,now())"
         db = MysqlHandle()
         db.insert(sql=sql,
                   value_list=[(urlmd5, url, file_name, file_size,
                                baiduyun_url, baiduyun_password)])
         db.close()
         update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (
             urlmd5)
         db = MysqlHandle()
         db.update(sql=update_sql)
         db.close()
         return True
     else:
         return False
Example #8
    def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
        downloader = PageDownload(proxy=proxy)
        page = downloader.simple_download(url=url)
        if is_json(page):
            json_page = json.loads(page)
            result = json_page["result"]
            total_count = result["total"]
            print ("total:"+str(total_count))
            if int(total_count) <= 10 and int(total_count)>0:
                content = json_page["content"]
                for item in content:
                    uid = item["uid"]
                    primary_uid = item["primary_uid"]

                    new_url = self.page_url % (uid, primary_uid)
                    new_urlmd5 = to_md5(in_str=new_url)
                    url_type = 0
                    boundary = None
                    status = 0
                    sql = "select * from  " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                    db = MysqlHandle()
                    results = db.query(sql=sql)
                    db.close()
                    if not results:
                        db = MysqlHandle()
                        insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                        db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, boundary,status)])
                        db.close()
                    else:
                        print "This url is already in the database!!"
            elif int(total_count) <= 0:
                pass
            else:
                min_interval = boundary.split(";")[0]
                max_interval = boundary.split(";")[1]
                lat_min = min_interval.split(",")[1]
                lat_max = max_interval.split(",")[1]
                lng_min = min_interval.split(",")[0]
                lng_max = max_interval.split(",")[0]

                boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)), int(float(lng_max)), int(float(lng_min)), 4, 0.2)
                for _boundary in boundarys:
                    _boundary_st = str(_boundary[1][0])+","+str(_boundary[0][0])+";"+str(_boundary[1][1])+","+str(_boundary[0][1])
                    new_url = self.list_url % (self.city_code,self.keyword, _boundary_st)
                    new_urlmd5 = to_md5(in_str=new_url)
                    url_type = 1
                    boundary = _boundary_st
                    status = 0
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                    db.close()
            update_sql = "update "+self.url_table+" set status=200 where urlmd5='%s'" % (urlmd5)
            db = MysqlHandle()
            db.update(sql=update_sql)
            db.close()
            return True
        else:
            return False
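Several of the examples (for instance #8, #9 and #14) also call `to_md5` and `is_json`. A likely minimal form of those two utilities, again inferred from usage rather than copied from the source:

# Hypothetical sketch of the two small utilities used by the JSON-based examples.
import hashlib
import json


def to_md5(in_str):
    # hex digest used as the urlmd5 / md5 key columns in the tables above
    if not isinstance(in_str, bytes):
        in_str = in_str.encode('utf-8')
    return hashlib.md5(in_str).hexdigest()


def is_json(page):
    # True only when the download succeeded and the body parses as JSON
    if page is None:
        return False
    try:
        json.loads(page)
        return True
    except ValueError:
        return False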
Example #9
def spider(city_code, name, keyword, key_token):
    boundary_table = "bdmap_api_" + name + "_" + str(
        city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                page_count = int(total) / 10
                for x in range(0, page_count + 2):
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            name = item["name"]
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception, e:
                                tag = None
                                print(e.message)
                            sql = "insert into " + page_table + " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)"
                            db = MysqlHandle()
                            db.insert(sql, [[
                                _md5, uid, name, address, province, city, area,
                                lng, lat, tag
                            ]])
                            db.close()

            sql = 'update ' + boundary_table + ' set status=200,total_count=' + str(
                total) + ' where md5="' + md5 + '"'
            db = MysqlHandle()
            db.update(sql)
            db.close()
Example #10
def insert_into_shp(shp, workspace, query_item):
    uid = query_item[0]
    name = query_item[1]
    geo_type = query_item[2]
    page = query_item[3]
    json_page = json.loads(page)
    if not json_page.has_key("content"):
        return
    content = json_page["content"]
    item_info = content[0]
    geo = item_info["geo"]
    _geo = geo.split("|")[2].strip(";")
    real_geo = "MULTILINESTRING("
    for segment in _geo.split(";"):
        real_geo = real_geo + "("
        los = segment.split(",")
        for i in range(0, len(los), 2):
            # if i>2 :
            #     if los[i]==los[i - 2] and los[i + 1]==los[i - 1]:
            #         continue
            real_geo = real_geo + los[i] + " " + los[i + 1] + ","
        real_geo = real_geo.strip(",") + "),"
    real_geo = real_geo.strip(",") + ")"

    timetable = item_info["timetable"]
    if timetable is None:
        timetable = ""
    price = int(item_info["ticketPrice"]) / 100.0
    current_city = json_page["current_city"]

    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""
    arcpy.env.workspace = workspace
    polyline = arcpy.FromWKT(real_geo)
    fields = [
        "UID", "NAME", "PROVINCE", "CITY", "GEO_TYPE", "TIMETABLE", "PRICE"
    ]
    fields.append("SHAPE@")
    values = [uid, name, province, city, geo_type, timetable, price, polyline]
    cursor = arcpy.da.InsertCursor(shp, fields)
    cursor.insertRow(values)
    del cursor

    db = MysqlHandle()
    # "item" is not defined in this function; the uid taken from query_item is what should be marked as processed
    sql = 'update baidu_busline_page set status=200 where uid="' + uid + '"'
    db.update(sql)
    db.close()
Example #11
    def download_page(self, urlmd5, url, proxy):
        downloader = PageDownload(proxy=proxy)
        page = downloader.simple_download(url=url)
        if is_json(page):
            page_json = json.loads(page)
            content = page_json["content"]
            uid = content["uid"]
            name = content["name"]
            address = content["addr"]
            if content.has_key("phone"):
                phone = content["phone"]
            else:
                phone = None

            x = content["navi_x"]
            y = content["navi_y"]
            # geo = content["geo"]
            ext = content["ext"]
            if isinstance(ext, dict):
                detail_info = ext["detail_info"]
            else:
                detail_info = {"info":""}
            if detail_info.has_key("tag"):
                tag = detail_info["tag"]
            else:
                tag = None
            if detail_info.has_key("image"):
                image = detail_info["image"]
            else:
                image = None
            if detail_info.has_key("display_info_redu"):
                dispaly_redu = detail_info["display_info_redu"]
            else:
                dispaly_redu = None
            if detail_info.has_key("price"):
                price = detail_info["price"]
            else:
                price = None

            sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,%s,%s,%s,null,null,null,%s,%s,%s,null,%s,now())"
            db = MysqlHandle()
            db.insert(sql=sql, value_list=[(urlmd5, url, uid, name, address, phone, x, y, tag, image, price, display_redu)])
            db.close()
            update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
            db = MysqlHandle()
            db.update(sql=update_sql)
            db.close()
            return True
        else:
            return False
Example #12
 def download_list_page(self,
                        urlmd5,
                        url,
                        proxy,
                        url_table,
                        filter_table,
                        domain=None):
     downloader = PageDownload(proxy=proxy)
     page = downloader.simple_download(url=url)
     if page is not None:
         new_urls = re.findall(self.reg, page)
         for _url in new_urls:
             if domain is not None:
                 if _url.startswith("/"):
                     new_url = domain + _url
                 else:
                     new_url = _url
             else:
                 new_url = _url
             url_type = self.filter_url(url=new_url,
                                        filter_table=filter_table)
             if url_type is not None:
                 new_urlmd5 = to_md5(in_str=new_url)
                 sql = "select * from  " + url_table + " where urlmd5='%s'" % (
                     new_urlmd5)
                 db = MysqlHandle()
                 results = db.query(sql=sql)
                 db.close()
                 if not results:
                     db = MysqlHandle()
                     insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
                     db.insert(sql=insert_sql,
                               value_list=[(new_urlmd5, new_url, url_type,
                                            0)])
                     db.close()
                 else:
                     print "This url is already in the database!!"
             else:
                 pass
         update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (
             urlmd5)
         db = MysqlHandle()
         db.update(sql=update_sql)
         db.close()
         return True
     else:
         return False
Example #13
    def start_feed(self):
        round_num = 0
        last_url_count = 0
        this_url_count = 0
        same_count = 0

        while self.run_sign:
            if len(self.queue) < 2500:
                start_time = time.time()
                round_index = round_num % 10
                db = MysqlHandle()
                url_list = db.query(self.sql)
                db.close()
                count = 0
                for url_data in url_list:
                    self.queue.put(
                        (url_data[0], url_data[1], url_data[2], url_data[3]))
                    update_sql = self.update_sql_base % url_data[0]
                    db = MysqlHandle()
                    db.update(update_sql)
                    db.close()
                    count += 1
                print 'FinishedQueue-' + self.url_table + ': %d TIME ELAPSED: %f ' % (
                    count, time.time() - start_time)

                if round_index == 0:
                    if this_url_count == last_url_count:
                        same_count += 1
                    else:
                        last_url_count = this_url_count
                        same_count = 0
                    this_url_count = 0
                else:
                    this_url_count += 1
                round_num += 1
            else:
                print 'The Queue is full!'
            if same_count == 100:
                print 'THE SAME NUM %d appeared for 10 rounds, turning feeding frequency down.' % last_url_count
                time.sleep(360)
                self.set_stop()
            else:
                time.sleep(5)
        print 'Exited Successfully!!'
Example #14
def update_geo_data(uid, l_type, city_code, name):
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    url = "http://map.baidu.com/?qt=ext&newmap=1&uid=%s&c=%d&nn=0&l=%d&ext_ver=new" % (
        uid, city_code, l_type)
    downloader = PageDownload()
    page = downloader.simple_download(url)
    if is_json(page):
        json_data = json.loads(page)
        if json_data.has_key("content"):
            content = json_data["content"]
            if content.has_key("geo"):
                geo = content["geo"]
                print(uid)
                md5 = to_md5(uid)
                sql = "update " + page_table + ' set geo="' + geo + '" where md5="' + md5 + '"'
                db = MysqlHandle()
                db.update(sql)
                db.close()
    time.sleep(random.uniform(0.5, 1.0))
Example #15
 def download_list_page(self, urlmd5, url, proxy, domain=None):
     downloader = PageDownload(proxy=proxy,timeout=10)
     page = downloader.simple_download(url=url)
     if page is not None:
         new_urls = re.findall(self.reg, page)
         # singer_names = re.findall(self.js0_reg, page)
         # for singer_name in singer_names:
         #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
         #     new_urls.append(merge_url)
         for _url in new_urls:
             if domain is not None:
                 if _url.startswith("/"):
                     new_url = domain + _url
                 else:
                     new_url = _url
             else:
                 new_url = _url
             url_type = self.filter_url(url=new_url)
             if url_type is not None:
                 new_urlmd5 = to_md5(in_str=new_url)
                 sql = "select * from  "+self.url_table+" where urlmd5='%s'" % (new_urlmd5)
                 db = MysqlHandle()
                 results = db.query(sql=sql)
                 db.close()
                 if not results:
                     db = MysqlHandle()
                     insert_sql = "insert into "+self.url_table+" values (%s,%s,%s,%s,now())"
                     db.insert(sql=insert_sql, value_list=[(new_urlmd5, new_url, url_type, 0)])
                     db.close()
                 else:
                     print "This url is already in the database!!"
             else:
                 pass
         update_sql = "update "+self.url_table+" set status=200 where urlmd5='%s'" % (urlmd5)
         db = MysqlHandle()
         db.update(sql=update_sql)
         db.close()
         return True
     else:
         return False
Example #16
 def download_page(self, urlmd5, url, proxy):
     downloader = PageDownload(proxy=proxy)
     page = downloader.simple_download(url=url)
     if page is not None:
         file_name = self.extract_field_from_page(
             page=page,
             reg=r'<h2>([^\]]+)    				<span class="c?b?">'
             r'WAV</span>')
         # if file_name is None:
         #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
         singer_name = self.extract_field_from_page(
             page=page,
             reg=r'<p><a href="/detail/\d+.html" style="color:'
             r'#217fbc">([^=]+)</a></p>')
         result = self.extract_field_from_page(
             page=page,
             reg=
             r'<p class="downurl">链接: (https?://pan.baidu.com/s/[^=]+) 密码: (\w+)</p>'
         )
         if result is not None:
             [baiduyun_url, baiduyun_password] = result
         else:
             baiduyun_url = None
             baiduyun_password = None
         sql = "insert into " + self.page_table + "  values (%s,%s,%s,%s,%s,%s,now())"
         db = MysqlHandle()
         db.insert(sql=sql,
                   value_list=[(urlmd5, url, file_name, singer_name,
                                baiduyun_url, baiduyun_password)])
         db.close()
         update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (
             urlmd5)
         db = MysqlHandle()
         db.update(sql=update_sql)
         db.close()
         return True
     else:
         return False
Example #17
def insert_into_stations(shp, workspace, query_item):
    l_uid = query_item[0]
    name = query_item[1]
    page = query_item[3]
    json_page = json.loads(page)
    if not json_page.has_key("content"):
        return
    content = json_page["content"]
    item_info = content[0]
    stations = item_info["stations"]
    current_city = json_page["current_city"]

    city = current_city["name"]
    if city is None:
        city = ""
    province = current_city["up_province_name"]
    if province is None:
        province = ""
    for station in stations:
        station_name = station["name"]
        station_geo = station["geo"].strip(";").split("|")[-1].replace(
            ",", " ")
        geo_str = "POINT(%s)" % (station_geo)
        station_uid = station["uid"]
        arcpy.env.workspace = workspace
        point = arcpy.FromWKT(geo_str)
        fields = ["UID", "NAME", "PROVINCE", "CITY", "L_UID", "L_NAME"]
        fields.append("SHAPE@")
        values = [
            station_uid, station_name, province, city, l_uid, name, point
        ]
        cursor = arcpy.da.InsertCursor(shp, fields)
        cursor.insertRow(values)
        del cursor
    db = MysqlHandle()
    # "item" is not defined here; the line uid taken from query_item is what should be marked as processed
    sql = 'update baidu_busline_page set status=200 where uid="' + l_uid + '"'
    db.update(sql)
    db.close()
Example #18
def download(division, d_type, abb_name=None):
    base_url = "http://xzqh.mca.gov.cn/defaultQuery?shengji=%s&diji=%s&xianji=%s"
    webSelenium = WebSelenium()
    if d_type == 1:
        url = base_url % (urllib.quote(
            (division + "(" + abb_name + ")").encode("gb2312")), "-1", "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']/table[@class='info_table']/tbody/tr[@class='shi_nub']"
        )
        for row in rows:
            c_division = row.find_element_by_xpath(
                "td[@class='name_left']/a[@class='name_text']").text
            population = row.find_element_by_xpath("td[3]").text
            area = row.find_element_by_xpath("td[4]").text
            code = row.find_element_by_xpath("td[5]").text
            zone = row.find_element_by_xpath("td[6]").text
            zip_code = row.find_element_by_xpath("td[7]").text
            if population == u'':
                population = None
            if area == u'':
                area = None
            if code == u'':
                code = None
            if zone == u'':
                zone = None
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL ,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 2, population, area,
                                     zone, zip_code, division)])
            db.close()
            if is_ok:
                db = MysqlHandle()
                sql = 'update divisions set status=200 where division="' + division + '" and type=' + str(
                    d_type)
                db.update(sql)
                db.close()

    elif d_type == 2:
        db = MysqlHandle()
        #sql = 'select parent from divisions where division="'+division+'"'
        sql = 'SELECT division, abb_name FROM divisions where division in (select parent from divisions where division="' + division + '")'
        res = db.query(sql)
        parent_division = res[0][0] + "(" + res[0][1] + ")"
        url = base_url % (urllib.quote(parent_division.encode("gb2312")),
                          urllib.quote(division.encode("gb2312")), "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']/table[@class='info_table']/tbody/tr"
        )
        for row in rows[2:]:
            c_division = row.find_element_by_xpath(
                "td[@class='name_left']").text
            population = row.find_element_by_xpath("td[3]").text
            if population == u'':
                population = None
            area = row.find_element_by_xpath("td[4]").text
            if area == u'':
                area = None
            code = row.find_element_by_xpath("td[5]").text
            if code == u'':
                code = None
            zone = row.find_element_by_xpath("td[6]").text
            if zone == u'':
                zone = None
            zip_code = row.find_element_by_xpath("td[7]").text
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL ,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 3, population, area,
                                     zone, zip_code, division)])
            if is_ok:
                db = MysqlHandle()
                sql = 'update divisions set status=200 where division="' + division + '" and type=' + str(
                    d_type)
                db.update(sql)
                db.close()

    else:
        pass
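Example #18 drives a real browser instead of downloading raw HTML. A minimal sketch of the `WebSelenium.simple_download` helper it assumes, using the standard selenium API; the timeout and browser choices here are illustrative guesses:

# Hypothetical sketch of the WebSelenium helper: open a browser, load the URL,
# and hand the driver back so the caller can run find_elements_by_xpath on it.
from selenium import webdriver


class WebSelenium(object):
    def simple_download(self, url, browser="chrome"):
        if browser == "chrome":
            driver = webdriver.Chrome()
        else:
            driver = webdriver.Firefox()
        driver.set_page_load_timeout(30)  # illustrative timeout
        driver.get(url)
        return driver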