def add_init_url(self, url_table_name, filter_config, city_code, keyword):
    # NOTE: the live insert below writes to self.url_table; the url_table_name
    # parameter is only referenced by the commented-out block.
    list_url = 'https://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=spot&from=webmap&c=%d&wd=%s&wd2=&pn=0&nn=0&db=0&sug=0&addr=0&pl_data_type=life&pl_sort_type=data_type&pl_sort_rule=0&pl_business_type=cinema&pl_business_id=&da_src=pcmappg.poi.page&on_gel=1&src=7&gr=3&l=12&rn=10&tn=B_NORMAL_MAP&ie=utf-8&b=(%s)'
    url = filter_config["url"]
    # db = MysqlHandle()
    # insert_sql = "insert into " + url_table_name + " values(%s,%s,%s,%s,%s,now())"
    # db.insert(sql=insert_sql, value_list=[(url["urlmd5"], url["url"], url["type"], url["boundary"], url["status"])])
    # db.close()
    boundary = url["boundary"]
    min_interval = boundary.split(";")[0]
    max_interval = boundary.split(";")[1]
    lat_min = min_interval.split(",")[1]
    lat_max = max_interval.split(",")[1]
    lng_min = min_interval.split(",")[0]
    lng_max = max_interval.split(",")[0]
    boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                               int(float(lng_max)), int(float(lng_min)), 20, 0.2)
    for _boundary in boundarys:
        # boundary string format: "lng_min,lat_min;lng_max,lat_max"
        _boundary_st = str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" + \
                       str(_boundary[1][1]) + "," + str(_boundary[0][1])
        new_url = list_url % (city_code, keyword, _boundary_st)
        new_urlmd5 = to_md5(in_str=new_url)
        url_type = 2
        boundary = _boundary_st
        status = 0
        db = MysqlHandle()
        insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
        db.insert(sql=insert_sql,
                  value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
        db.close()
def split_boundary_outline(b_type, city_code, name):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    sql = 'select lng_min,lat_min,lng_max,lat_max from ' + boundary_table + \
          " where type=" + str(b_type) + " and total_count=400"
    db = MysqlHandle()
    query_res = db.query(sql)
    for (lng_min, lat_min, lng_max, lat_max) in query_res:
        boundarys = split_boundary(float(lat_max), float(lat_min),
                                   float(lng_max), float(lng_min), 10, 0.1)
        for _boundary in boundarys:
            _lng_min = _boundary[1][0]
            _lat_min = _boundary[0][0]
            _lng_max = _boundary[1][1]
            _lat_max = _boundary[0][1]
            _boundary_st = str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," + \
                           str(_boundary[0][1]) + "," + str(_boundary[1][1])
            md5 = to_md5(_boundary_st)
            db = MysqlHandle()
            sql = "insert into " + boundary_table + " values(%s,%s,2,%s,%s,%s,%s,0,0,now())"
            db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
            db.close()
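# split_boundary is used by several functions here but is not defined in this file.
# Below is a minimal sketch of a plausible implementation, inferred from the call
# sites: it appears to tile a bounding box into cells of `step` degrees (at most
# `n` per axis) and return ((lat_lo, lat_hi), (lng_lo, lng_hi)) pairs. The real
# helper may differ.
def split_boundary(lat_max, lat_min, lng_max, lng_min, n, step):
    cells = []
    for i in range(n):
        lat_lo = lat_min + i * step
        if lat_lo >= lat_max:
            break
        lat_hi = min(lat_lo + step, lat_max)
        for j in range(n):
            lng_lo = lng_min + j * step
            if lng_lo >= lng_max:
                break
            lng_hi = min(lng_lo + step, lng_max)
            cells.append(((lat_lo, lat_hi), (lng_lo, lng_hi)))
    return cells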
def download_info_page(self, e_id):
    page_url = self.page_base_url % (e_id)
    print page_url
    downloader = PageDownload()
    page = downloader.simple_download(page_url)
    if page:
        site = re.findall(r"<br />网址:([^<]+)<br />", page)
        if site:
            site = site[0]
        else:
            site = None
        company_name = re.findall(r"<strong>([^<]+)</strong>", page)
        if company_name:
            company_name = company_name[0]
        else:
            company_name = None
        zw_num = re.findall(r'<span class="glyphicon glyphicon-envelope"></span> 展位号: (\w+)', page)
        if zw_num:
            zw_num = zw_num[0]
        else:
            zw_num = None
        mail = re.findall(r'</span> 邮箱:<a href="mailto:([^<]+@[^<]+)">[^<]+@[^<]+</a>', page)
        if mail:
            mail = mail[0]
        else:
            mail = None
        db = MysqlHandle()
        sql = "insert into diecast VALUES (%s,%s,%s,%s)"
        db.insert(sql, [(site, company_name, zw_num, mail)])
        db.close()
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy, timeout=5)
    page = downloader.simple_download(url=url)
    if page is not None:
        company_name = self.extract_field_from_page(
            page=page,
            reg=r"""<meta name="keywords" content="([^=]+)"/>""",
            index=-1)
        company_net = self.extract_field_from_page(
            page=page,
            reg=r'"/companyurlimg.php\?url=([^<]+)" alt="myImage" style="border:none')
        address = self.extract_field_from_page(page=page.decode("utf-8"),
                                               reg="<p>([^=]+)</p>", index=-1)
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, company_name, company_net, address)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page, reg=r'<h1>([^<]+)\[FLAC格式\]下载</h1>')
        if file_name is None:
            file_name = self.extract_field_from_page(
                page=page, reg=r'<h1>([^<]+)下载?</h1>')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r'<meta name="description" content="'
                r'[^<]*(https?://pan.baidu.com/[^\s]+) [^<]+"/>')
        baiduyun_password = self.extract_field_from_page(
            page=page,
            reg=r'<meta name="description" content'
                r'="[^<]+密码[\W]+(\w+)..."/>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, file_name, baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
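# extract_field_from_page is called throughout the download_page variants but is not
# defined in this file. A minimal sketch of the method, assuming it wraps re.findall
# and returns the match at `index` (a string for one capture group, a tuple for
# several) or None when nothing matches; the default index here is an assumption:
import re

def extract_field_from_page(self, page, reg, index=0):
    matches = re.findall(reg, page)
    if matches:
        return matches[index]
    return None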
def get_page_url(line_name, line_type, city_code, coords):
    base_url = 'http://map.baidu.com/?newmap=1&reqflag=pcmap&biz=1&from=webmap&da_par=direct&pcevaname=pc4.1&qt=bl&da_src=searchBox.button&wd=%s&c=%d&l=13&b=(%s)&from=webmap&sug_forward=&tn=B_NORMAL_MAP&nn=0'
    downloader = PageDownload(timeout=5)
    page = downloader.simple_download(base_url % (line_name, city_code, coords))
    if is_json(page):
        json_data = json.loads(page)
        if not json_data.has_key('content'):
            print base_url
            return
        contents = json_data['content']
        line_list = []
        for item in contents:
            name = item['name']
            if not item.has_key("uid"):
                print name, base_url
                continue
            uid = item['uid']
            page_url = 'http://map.baidu.com/?qt=bsl&tps=&newmap=1&uid=' + uid + '&c=%d' % (city_code)
            line_list.append((name, uid, page_url, line_type))
        db = MysqlHandle()
        insert_sql = "insert into baidu_busline_url_analyse values(%s,%s,%s,%s,0)"
        db.insert(insert_sql, line_list)
        db.close()
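# is_json guards every JSON endpoint above; it is not defined in this file. A minimal
# sketch, assuming it simply reports whether a response body parses as JSON:
import json

def is_json(page):
    if not page:
        return False
    try:
        json.loads(page)
        return True
    except ValueError:
        return False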
def download_page(self, base_url, geo, proxy, city_code, sec, page_table):
    downloader = PageDownload(proxy=proxy, hd=mobike_headers)
    post_dic = {
        'longitude': str(geo[0]),
        'latitude': str(geo[1]),
        'citycode': str(city_code),
        'errMsg': 'getMapCenterLocation:ok'
    }
    page = downloader.download_with_post(url=base_url, post_data=post_dic)
    if is_json(page):
        json_page = json.loads(page)
        if json_page.has_key("object"):
            mobike_object = json_page["object"]
            items = []
            for mobike in mobike_object:
                bike_id = mobike["distId"]
                bike_type = mobike["biketype"]
                b_type = mobike["type"]
                lng = mobike["distX"]
                lat = mobike["distY"]
                dis_source = str(geo[0]) + "," + str(geo[1])
                item = (bike_id, bike_type, b_type, lat, lng, dis_source, sec)
                items.append(item)
            db = MysqlHandle()
            sql = "insert into " + page_table + " values(%s,%s,%s,%s,%s,%s,%s,now())"
            db.insert(sql=sql, value_list=items)
            db.close()  # close after the batch insert so the handle is not leaked
            return True
    else:
        return False
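# PageDownload is the shared fetcher (constructor arguments seen above: proxy, timeout,
# hd for headers; methods simple_download and download_with_post). It is not defined in
# this file; the sketch below is a requests-based approximation under those
# assumptions, not the project's actual implementation.
import requests

class PageDownload(object):
    def __init__(self, proxy=None, timeout=10, hd=None):
        self.proxies = {"http": proxy, "https": proxy} if proxy else None
        self.timeout = timeout
        self.headers = hd

    def simple_download(self, url):
        # GET the page; return the raw body, or None on any network error
        try:
            rsp = requests.get(url, headers=self.headers,
                               proxies=self.proxies, timeout=self.timeout)
            return rsp.content
        except requests.RequestException:
            return None

    def download_with_post(self, url, post_data):
        # POST a form; return the raw body, or None on any network error
        try:
            rsp = requests.post(url, data=post_data, headers=self.headers,
                                proxies=self.proxies, timeout=self.timeout)
            return rsp.content
        except requests.RequestException:
            return None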
def download_page(self, urlmd5, url, proxy, url_table, page_table):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page, reg=r'<h1 class="yh mt_1 f_32">([^<]+\.[a-z]+)</h1>')
        file_size = self.extract_field_from_page(
            page=page,
            reg=r'<h3 class="c999 fl mt_05 f_12 n yh">'
                r'<em class="n ml_1 mr_1">·</em>(\d+\.?\d+M)</h3>')
        baiduyun_url = self.extract_field_from_page(
            page=page, reg=r'href="(https?://pan.baidu.com/[^\s]+)"')
        baiduyun_password = self.extract_field_from_page(
            page=page, reg=r'<em class="dn"></em>密码:(\w+)</b>')
        sql = "insert into " + page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, file_name, file_size, baiduyun_url,
                               baiduyun_password)])
        db.close()
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page, reg=r'<li class="fl ml_1 mt_08 c999">([^=]+)</li>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(
            page=page,
            reg=r'<li><a class="fl c3b ml_1 mt_08" href="http:'
                r'//www.51ape.com/[^=]+/" title="[^=]+">([^=]+)'
                r'</a></li>')
        baiduyun_url = self.extract_field_from_page(
            page=page, reg=r'href="(https?://pan.baidu.com/s/[^=]+)"')
        baiduyun_password = self.extract_field_from_page(
            page=page, reg=r'提取<em class="dn"></em>密码:(\w+)</b>')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, file_name, singer_name, baiduyun_url,
                               baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        # print page.decode("utf-8")
        file_name = self.extract_field_from_page(page=page, reg=r'专辑名称:([^<]+)')
        if file_name is None:
            file_name = self.extract_field_from_page(
                page=page, reg=r'<h1 class="title">([^<]+)</h1>')
        music_type = self.extract_field_from_page(
            page=page, reg=r' <a href="/\w+/">([^<]+)</a>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(page=page, reg=r'专辑艺人:([^<]+)')
        baiduyun_url = self.extract_field_from_page(
            page=page,
            reg=r"""<a href="#ecms" onclick="window.open\('([^<]+)','','width=300,height=300,resizable=yes'\)""")
        print baiduyun_url
        if baiduyun_url is None:
            return False
        baiduyun_url = self.domain + baiduyun_url  # the page carries a relative link
        baiduyun_password = self.extract_field_from_page(page=page, reg=r'密码: (\w+)')
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, file_name, music_type, singer_name,
                               baiduyun_url, baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def get_info(urlmd5, url, driver):
    driver.get(url)
    page = driver.page_source.encode("utf-8")
    # print page
    contacts = re.findall(r"<strong>联系人</strong>:([^=]+)<a", page)
    if contacts:
        contacts = contacts[0]
    else:
        contacts = None
    e_mail = re.findall(r'<a href="mailto:([^"]+@[^"]+)"', page)
    if e_mail:
        e_mail = e_mail[0]
    else:
        e_mail = None
    phone_num = re.findall(r"<p><strong>电话</strong>: ([^/]+)</p>", page)
    if phone_num:
        phone_num = phone_num[0]
    else:
        phone_num = None
    # print phone_num
    db = MysqlHandle()
    sql = "insert into cphi_info_table values(%s,%s,%s,%s,%s)"
    print e_mail
    db.insert(sql, [(urlmd5, url, contacts, e_mail, phone_num)])
    db.close()
def download_list_page(self, urlmd5, url, proxy, boundary, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        json_page = json.loads(page)
        result = json_page["result"]
        total_count = result["total"]
        print("total:" + str(total_count))
        if int(total_count) <= 10 and int(total_count) > 0:
            # few enough results: enqueue the detail-page URL for each POI
            content = json_page["content"]
            for item in content:
                uid = item["uid"]
                primary_uid = item["primary_uid"]
                new_url = self.page_url % (uid, primary_uid)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 0
                boundary = None
                status = 0
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                    db.close()
                else:
                    print "This url is already in the database!!"
        elif int(total_count) <= 0:
            pass
        else:
            # too many results: split the boundary into finer tiles and enqueue a
            # list-page URL per tile
            min_interval = boundary.split(";")[0]
            max_interval = boundary.split(";")[1]
            lat_min = min_interval.split(",")[1]
            lat_max = max_interval.split(",")[1]
            lng_min = min_interval.split(",")[0]
            lng_max = max_interval.split(",")[0]
            boundarys = split_boundary(int(float(lat_max)), int(float(lat_min)),
                                       int(float(lng_max)), int(float(lng_min)), 4, 0.2)
            for _boundary in boundarys:
                _boundary_st = str(_boundary[1][0]) + "," + str(_boundary[0][0]) + ";" + \
                               str(_boundary[1][1]) + "," + str(_boundary[0][1])
                new_url = self.list_url % (self.city_code, self.keyword, _boundary_st)
                new_urlmd5 = to_md5(in_str=new_url)
                url_type = 1
                boundary = _boundary_st
                status = 0
                db = MysqlHandle()
                insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,%s,now())"
                db.insert(sql=insert_sql,
                          value_list=[(new_urlmd5, new_url, url_type, boundary, status)])
                db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def add_filter_urls(self, filter_table_name, filter_config):
    filters = filter_config["filters"]
    value_list = []
    for _filter in filters:
        value_list.append((_filter["type"], _filter["filter"]))
    db = MysqlHandle()
    insert_sql = "insert into " + filter_table_name + " values(%s,%s)"
    db.insert(sql=insert_sql, value_list=value_list)
    db.close()
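# For reference, a filter_config of the shape consumed by add_init_url and
# add_filter_urls above might look like this. Field names are inferred from the
# lookups; every value is illustrative, not taken from the project:
SAMPLE_FILTER_CONFIG = {
    "url": {
        "urlmd5": "d41d8cd98f00b204e9800998ecf8427e",
        "url": "https://map.baidu.com/?...",
        "type": 1,
        "boundary": "116.0,39.4;117.0,41.1",  # "lng_min,lat_min;lng_max,lat_max"
        "status": 0
    },
    "filters": [
        {"type": 0, "filter": r"qt=inf"},   # hypothetical detail-page pattern
        {"type": 1, "filter": r"qt=spot"}   # hypothetical list-page pattern
    ]
}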
def spider(city_code, name, keyword, key_token):
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    page_table = "bdmap_api_" + name + "_" + str(city_code) + "_page_table"
    base_url = "http://api.map.baidu.com/place/v2/search?query=%s&scope=2&bounds=%s&output=json&ak=%s&page_num=%d"
    sql = "select md5, boundary from " + boundary_table + " where status=0"
    db = MysqlHandle()
    res_data = db.query(sql)
    for (md5, boundary) in res_data:
        url = base_url % (keyword, boundary, key_token, 0)
        downloader = PageDownload()
        page = downloader.simple_download(url)
        if is_json(page):
            json_data = json.loads(page)
            status = json_data["status"]
            total = json_data["total"]
            print(boundary, url, total)
            if status == 0 and int(total) > 0:
                page_count = int(total) / 10  # floor division (Python 2); 10 results per page
                for x in range(0, page_count + 2):  # +2 so the remainder page is also fetched
                    _url = base_url % (keyword, boundary, key_token, x)
                    downloader = PageDownload()
                    _page = downloader.simple_download(_url)
                    if is_json(_page):
                        _json_data = json.loads(_page)
                        results = _json_data["results"]
                        for item in results:
                            name = item["name"]  # NOTE: shadows the `name` parameter (table names were already built above)
                            address = item["address"]
                            province = item["province"]
                            city = item["city"]
                            area = item["area"]
                            uid = item["uid"]
                            _md5 = to_md5(uid)
                            lat = item["location"]["lat"]
                            lng = item["location"]["lng"]
                            try:
                                tag = item["detail_info"]["tag"]
                            except Exception, e:
                                tag = None
                                print(e.message)
                            sql = "insert into " + page_table + " values(%s,%s,%s,%s,null,%s,%s,%s,%s,%s,null,null,%s,null,now(),null)"
                            db = MysqlHandle()
                            db.insert(sql, [[_md5, uid, name, address, province,
                                             city, area, lng, lat, tag]])
                            db.close()
            sql = 'update ' + boundary_table + ' set status=200,total_count=' + str(total) + ' where md5="' + md5 + '"'
            db = MysqlHandle()
            db.update(sql)
            db.close()
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if is_json(page):
        page_json = json.loads(page)
        content = page_json["content"]
        uid = content["uid"]
        name = content["name"]
        address = content["addr"]
        if content.has_key("phone"):
            phone = content["phone"]
        else:
            phone = None
        x = content["navi_x"]
        y = content["navi_y"]
        # geo = content["geo"]
        ext = content["ext"]
        if isinstance(ext, dict):
            detail_info = ext["detail_info"]
        else:
            detail_info = {"info": ""}
        if detail_info.has_key("tag"):
            tag = detail_info["tag"]
        else:
            tag = None
        if detail_info.has_key("image"):
            image = detail_info["image"]
        else:
            image = None
        if detail_info.has_key("display_info_redu"):
            display_redu = detail_info["display_info_redu"]
        else:
            display_redu = None
        if detail_info.has_key("price"):
            price = detail_info["price"]
        else:
            price = None
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,%s,%s,null,null,null,%s,%s,%s,null,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, uid, name, address, phone, x, y,
                               tag, image, price, display_redu)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def download_page():
    db = MysqlHandle()
    query_sql = "select uid,min(name),min(line_type),min(page_url) from baidu_busline_url_analyse where status=0 group by uid"
    page_infs = db.query(query_sql)
    db.close()
    downloader = PageDownload()
    for item in page_infs:
        print(item[0])
        page = downloader.simple_download(item[3])
        # if is_json(page):
        #     json_page = json.loads(page)
        #     if json_page.has_key("content"):
        #         main_info = json_page["content"][0]
        #         name = main_info["name"]
        #         timeable = main_info["timeable"]
        db = MysqlHandle()
        is_success = False
        if page is not None:
            insert_sql = "insert into baidu_busline_page values(%s,%s,%s,%s,NULL)"
            is_success = db.insert(insert_sql, [(item[0], item[1], item[2], page)])
        if is_success and page is not None:
            update_sql = "update baidu_busline_url_analyse set status=200 where uid='%s'" % (item[0])
            db.update(update_sql)
        db.close()
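# MysqlHandle is the thin DB wrapper every function here leans on: query(sql),
# insert(sql, value_list), update(sql), delete(sql), close(). A minimal pymysql-based
# sketch under those assumptions; the connection settings are placeholders, not the
# project's real configuration:
import pymysql

class MysqlHandle(object):
    def __init__(self):
        self.conn = pymysql.connect(host="localhost", user="root",
                                    password="...", db="spider", charset="utf8")

    def query(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        return cursor.fetchall()

    def insert(self, sql, value_list):
        # batch insert; returns True on success, False after rolling back
        try:
            cursor = self.conn.cursor()
            cursor.executemany(sql, value_list)
            self.conn.commit()
            return True
        except pymysql.MySQLError:
            self.conn.rollback()
            return False

    def update(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()

    def delete(self, sql):
        cursor = self.conn.cursor()
        cursor.execute(sql)
        self.conn.commit()

    def close(self):
        self.conn.close()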
def download_list_page(self, urlmd5, url, proxy, url_table, filter_table, domain=None):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        for _url in new_urls:
            if domain is not None and _url.startswith("/"):
                new_url = domain + _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url, filter_table=filter_table)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def init_spider(city_code, name, boundary):
    # Initializer(source="bdmap_api_"+name+"_"+str(city_code), table_config="table_config.json", filter_config=None, need_proxy=False)
    boundary_table = "bdmap_api_" + name + "_" + str(city_code) + "_boundary_table"
    lng_min, lat_min, lng_max, lat_max = boundary
    boundarys = split_boundary(float(lat_max), float(lat_min),
                               float(lng_max), float(lng_min), 10, 0.1)
    for _boundary in boundarys:
        _lng_min = _boundary[1][0]
        _lat_min = _boundary[0][0]
        _lng_max = _boundary[1][1]
        _lat_max = _boundary[0][1]
        _boundary_st = str(_boundary[0][0]) + "," + str(_boundary[1][0]) + "," + \
                       str(_boundary[0][1]) + "," + str(_boundary[1][1])
        md5 = to_md5(_boundary_st)
        db = MysqlHandle()
        sql = "insert into " + boundary_table + " values(%s,%s,1,%s,%s,%s,%s,0,0,now())"
        db.insert(sql, [[md5, _boundary_st, _lng_min, _lat_min, _lng_max, _lat_max]])
        db.close()
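# Example bootstrap for the two-stage split: seed coarse type-1 tiles for a city, then
# re-split any tile whose total_count saturated at 400. The city code and bounding box
# below are illustrative values only, not taken from the project:
if __name__ == "__main__":
    init_spider(city_code=131, name="hotel",
                boundary=("116.0", "39.4", "117.0", "41.1"))  # lng_min, lat_min, lng_max, lat_max
    split_boundary_outline(b_type=1, city_code=131, name="hotel")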
def download_list_page(self, urlmd5, url, proxy, domain=None):
    downloader = PageDownload(proxy=proxy, timeout=10)
    page = downloader.simple_download(url=url)
    if page is not None:
        new_urls = re.findall(self.reg, page)
        # singer_names = re.findall(self.js0_reg, page)
        # for singer_name in singer_names:
        #     merge_url = "http://www.51ape.com/skin/ape/php/qx_2.php?qx=" + singer_name
        #     new_urls.append(merge_url)
        for _url in new_urls:
            if domain is not None and _url.startswith("/"):
                new_url = domain + _url
            else:
                new_url = _url
            url_type = self.filter_url(url=new_url)
            if url_type is not None:
                new_urlmd5 = to_md5(in_str=new_url)
                sql = "select * from " + self.url_table + " where urlmd5='%s'" % (new_urlmd5)
                db = MysqlHandle()
                results = db.query(sql=sql)
                db.close()
                if not results:
                    db = MysqlHandle()
                    insert_sql = "insert into " + self.url_table + " values (%s,%s,%s,%s,now())"
                    db.insert(sql=insert_sql,
                              value_list=[(new_urlmd5, new_url, url_type, 0)])
                    db.close()
                else:
                    print "This url is already in the database!!"
            else:
                pass
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
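# to_md5 produces the de-duplication keys used throughout. A one-line sketch, assuming
# it returns the hex MD5 digest of the input string:
import hashlib

def to_md5(in_str):
    return hashlib.md5(in_str).hexdigest()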
def download_page(self, urlmd5, url, proxy):
    downloader = PageDownload(proxy=proxy)
    page = downloader.simple_download(url=url)
    if page is not None:
        file_name = self.extract_field_from_page(
            page=page,
            reg=r'<h2>([^\]]+) <span class="c?b?">'
                r'WAV</span>')
        # if file_name is None:
        #     file_name = self.extract_field_from_page(page=page, reg=r'<h1>([^<]+)下载?</h1>')
        singer_name = self.extract_field_from_page(
            page=page,
            reg=r'<p><a href="/detail/\d+.html" style="color:'
                r'#217fbc">([^=]+)</a></p>')
        # two capture groups, so a successful match yields a (url, password) pair
        result = self.extract_field_from_page(
            page=page,
            reg=r'<p class="downurl">链接: (https?://pan.baidu.com/s/[^=]+) 密码: (\w+)</p>')
        if result is not None:
            [baiduyun_url, baiduyun_password] = result
        else:
            baiduyun_url = None
            baiduyun_password = None
        sql = "insert into " + self.page_table + " values (%s,%s,%s,%s,%s,%s,now())"
        db = MysqlHandle()
        db.insert(sql=sql,
                  value_list=[(urlmd5, url, file_name, singer_name, baiduyun_url,
                               baiduyun_password)])
        db.close()
        update_sql = "update " + self.url_table + " set status=200 where urlmd5='%s'" % (urlmd5)
        db = MysqlHandle()
        db.update(sql=update_sql)
        db.close()
        return True
    else:
        return False
def district_table(table_name):
    # de-duplicate proxy IPs: keep one row per distinct proxy
    query_sql = 'select distinct proxy from ' + table_name
    db = MysqlHandle()
    proxys = db.query(query_sql)
    db.close()
    delete_sql = 'delete from ' + table_name
    db = MysqlHandle()
    db.delete(delete_sql)
    db.close()
    db = MysqlHandle()
    insert_sql = 'insert into ' + table_name + ' values (%s,now(),0,100)'
    is_success = db.insert(insert_sql, proxys)
    if is_success:
        print u'The filtering has finished!'
    db.close()
def GET(self):
    inputs = web.input()
    if inputs.has_key("token"):
        token = inputs["token"]
        if token == "whanys":
            if inputs.has_key("username") and inputs.has_key("password"):
                username = inputs["username"]
                password = inputs["password"]
                sql = "select * from user_table_yixin where username='******'"
                db = MysqlHandle()
                res = db.query(sql)
                if res:
                    result = {
                        "status": "1",
                        "msg": "failed,the username is already exist!"
                    }
                else:
                    db = MysqlHandle()
                    sql = "insert into user_table_yixin values(%s,%s,now(),%s)"
                    res = db.insert(sql, [(username, password, 0)])
                    if res:
                        result = {"status": "0", "msg": "success"}
                    else:
                        # fallback so result is always bound when the insert fails
                        result = {"status": "1", "msg": "failed,insert error!"}
            else:
                result = {
                    "status": "1",
                    "msg": "failed,parameters not enough!"
                }
        else:
            result = {
                "status": "1",
                "msg": "failed,your token is not true!"
            }
    else:
        result = {"status": "1", "msg": "failed,you need a token!"}
    return result
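# A request against the handler above carries the parameters it checks, e.g.
#   GET /register?token=whanys&username=alice&password=secret
# ("/register" is a guess -- the web.py URL mapping is not part of this file).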
def download(division, d_type, abb_name=None):
    base_url = "http://xzqh.mca.gov.cn/defaultQuery?shengji=%s&diji=%s&xianji=%s"
    webSelenium = WebSelenium()
    if d_type == 1:
        # province level: query by "division(abbreviation)", leave the lower levels as -1
        url = base_url % (urllib.quote(
            (division + "(" + abb_name + ")").encode("gb2312")), "-1", "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']/table[@class='info_table']/tbody/tr[@class='shi_nub']"
        )
        for row in rows:
            c_division = row.find_element_by_xpath(
                "td[@class='name_left']/a[@class='name_text']").text
            population = row.find_element_by_xpath("td[3]").text
            area = row.find_element_by_xpath("td[4]").text
            code = row.find_element_by_xpath("td[5]").text
            zone = row.find_element_by_xpath("td[6]").text
            zip_code = row.find_element_by_xpath("td[7]").text
            if population == u'':
                population = None
            if area == u'':
                area = None
            if code == u'':
                code = None
            if zone == u'':
                zone = None
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 2, population, area, zone,
                                     zip_code, division)])
            db.close()
            if is_ok:
                db = MysqlHandle()
                sql = 'update divisions set status=200 where division="' + division + '" and type=' + str(d_type)
                db.update(sql)
                db.close()
    elif d_type == 2:
        # city level: look up the parent province first to build the query
        db = MysqlHandle()
        # sql = 'select parent from divisions where division="'+division+'"'
        sql = 'SELECT division, abb_name FROM divisions where division in (select parent from divisions where division="' + division + '")'
        res = db.query(sql)
        parent_division = res[0][0] + "(" + res[0][1] + ")"
        url = base_url % (urllib.quote(parent_division.encode("gb2312")),
                          urllib.quote(division.encode("gb2312")), "-1")
        driver = webSelenium.simple_download(url, "chrome")
        print url
        rows = driver.find_elements_by_xpath(
            "/html/body/div[@id='center']/div[@class='mid_con_qt']/table[@class='info_table']/tbody/tr"
        )
        for row in rows[2:]:
            c_division = row.find_element_by_xpath("td[@class='name_left']").text
            population = row.find_element_by_xpath("td[3]").text
            if population == u'':
                population = None
            area = row.find_element_by_xpath("td[4]").text
            if area == u'':
                area = None
            code = row.find_element_by_xpath("td[5]").text
            if code == u'':
                code = None
            zone = row.find_element_by_xpath("td[6]").text
            if zone == u'':
                zone = None
            zip_code = row.find_element_by_xpath("td[7]").text
            if zip_code == u'':
                zip_code = None
            print(c_division, population, area, code, zone, zip_code)
            db = MysqlHandle()
            sql = "insert into divisions values(%s,%s,NULL,%s,%s,%s,%s,%s,%s,0)"
            is_ok = db.insert(sql, [(c_division, code, 3, population, area, zone,
                                     zip_code, division)])
            db.close()
            if is_ok:
                db = MysqlHandle()
                sql = 'update divisions set status=200 where division="' + division + '" and type=' + str(d_type)
                db.update(sql)
                db.close()
    else:
        pass
def run(self):
    proxy = self.proxy_mgr.get_proxy()[1]
    time.sleep(2)
    s_time = time.time()   # start of the current rate-limit window
    sl_time = time.time()  # time the current proxy was taken into use
    while 1:
        tuple_from_queue = self.queue.get()
        if tuple_from_queue is not None:
            try:
                self.lock.acquire()
                (num, origin, destination) = tuple_from_queue
                url = "http://restapi.amap.com/v3/direction/transit/integrated?origin=%s&" \
                      "destination=%s&city=%d&output=json&key=%s" % (origin, destination, 131, self.key)
                # print url
                downloader = PageDownload()
                page = downloader.simple_download(url)
                print num, origin, destination
                self.count += 1
                if page is not None:
                    if is_json(page):
                        json_data = json.loads(page)
                        if json_data["status"] == "1":
                            route = json_data['route']
                            transits = route["transits"]
                            distance = None
                            duration = None
                            cost = None
                            for transit in transits:
                                distance = round(float(transit["distance"]) / 1000, 2)  # km
                                duration = round(float(transit['duration']) / 3600, 2)  # hours
                                cost = transit['cost']
                                if type(cost) is not list:
                                    cost = round(float(transit['cost']), 2)
                                    break
                                else:
                                    continue
                            db = MysqlHandle()
                            sql = "insert into amap_busline_route VALUES (%s,%s,%s,%s,%s,%s)"
                            db.insert(sql=sql,
                                      value_list=[(num, origin, destination, distance, duration, cost)])
                            db.close()
                        else:
                            if json_data["info"] == "DAILY_QUERY_OVER_LIMIT":
                                # the key's daily quota is exhausted
                                print "key: " + self.key + " use out"
                                self.lock.release()  # release before leaving the loop
                                break
                            else:
                                print json_data["info"]
                    else:
                        print "result is not json format"
                        self.queue.put_nowait((num, origin, destination))
                else:
                    self.queue.put_nowait((num, origin, destination))
                    print "the page is None"
                    time.sleep(2)
                self.lock.release()
                e_time = time.time()
                if self.count >= 50:  # throttle: no more than 50 requests per minute
                    if e_time - s_time < 60:
                        time.sleep(60 - e_time + s_time)
                    s_time = time.time()
                    self.count = 0  # reset the window counter so the throttle keeps working
                if e_time - sl_time > 300:  # rotate the proxy after 5 minutes of use
                    proxy = self.proxy_mgr.change_proxy(proxy)[1]
                    print "proxy has changed to: " + proxy
                    sl_time = time.time()
            except Exception:
                with open("error.txt", "a") as f:
                    f.write(str(tuple_from_queue[0]) + "\n")
                self.queue.put_nowait(tuple_from_queue)
                if self.lock.locked():
                    self.lock.release()  # don't leave the lock held on errors
        else:
            print 'queue is empty,please wait'
            time.sleep(10)
def add_init_url(self, url_table_name, filter_config, city_code, keyword):
    url = filter_config["url"]
    db = MysqlHandle()
    insert_sql = "insert into " + url_table_name + " values(%s,%s,%s,%s,now())"
    db.insert(sql=insert_sql,
              value_list=[(url["urlmd5"], url["url"], url["type"], url["status"])])
    db.close()
def insert_to_table(self):
    urls = self.transform()
    sql = "insert into carflac_transform_table values(%s,%s,%s)"
    db = MysqlHandle()
    db.insert(sql=sql, value_list=urls)
    db.close()
"source": "-5", "source-version": "10005" }, boundary='----ofo-boundary-MC40MjcxMzUw' ) page = downloader.download_with_post(url=TEST_URL,post_data=post_data) if page is not None: AVALIABLE_IPS.append(ip) print ip[0]+" is ok!" else: pass except Exception, e: print str(e) pass db = MysqlHandle() db.insert('INSERT INTO TEMP_IPS_MANAGE VALUES (%s,now(),0,100)', AVALIABLE_IPS) db.close() district_table('TEMP_IPS_MANAGE') # 去除重复代理ip def district_table(table_name): query_sql = 'select distinct proxy from '+table_name db = MysqlHandle() proxys = db.query(query_sql) db.close() delete_sql = 'delete from '+table_name db = MysqlHandle() db.delete(delete_sql) db.close() db = MysqlHandle()