def crawl2(self):
    for p in range(self.start_page, self.total_page + 1):
        time.sleep(random.randint(1, 3))
        data = {
            "__VIEWSTATE": self.__VIEWSTATE,
            "__EVENTARGUMENT": p,
            "__EVENTTARGET": self.__EVENTTARGET,
        }
        print("crawling page {}".format(p))
        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": fake_useragent()
        }
        browser = requests.post(self.URL, headers=headers, data=urlencode(data))
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            trs = html.xpath('//table[@id="DataGrid1"]/tr')
            for n in range(1, len(trs)):
                tds = trs[n].xpath('.//td')
                data = {
                    "name": tds[1].text_content().strip(),
                    "area": tds[2].text_content().strip(),
                    "addr": tds[3].text_content().strip(),
                    "level": tds[4].text_content().strip(),
                }
                print(data)
                self.save2db(data)
        else:
            print("Error while crawling page {}".format(p))

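# fake_useragent() is called by every request in this section but its definition
# is not included here. A minimal sketch of such a helper, assuming it simply
# picks from a hand-maintained list of desktop User-Agent strings (the strings
# below are illustrative, not taken from the original code):
import random

def fake_useragent():
    """Return a randomly chosen User-Agent string for request headers."""
    agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/603.3.8 "
        "(KHTML, like Gecko) Version/10.1.2 Safari/603.3.8",
        "Mozilla/5.0 (X11; Linux x86_64; rv:54.0) Gecko/20100101 Firefox/54.0",
    ]
    return random.choice(agents)
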
def prepare2crawl(self, start):
    url = "http://www.cqwsjsw.gov.cn/wsfw/yjff.aspx?flag=sel&seldq=0&pageNo={}"
    # for p in range(start, 704):
    for p in range(704, 705):
        print("------ Processing page {} ------".format(p))
        header = {
            "User-Agent": fake_useragent()
        }
        req = urllib.request.Request(url.format(p), headers=header, method="GET")
        resp = urllib.request.urlopen(req)
        if resp.status == 200:
            html = lxml.html.fromstring(resp.read().decode("gbk"))
            # html = lxml.html.fromstring(resp.read().decode("gb18030"))
            tables = html.xpath('//td[@height="34"]/table')
            for n in range(1, len(tables)):
                tds = tables[n].xpath('.//td')
                data = {
                    "area": tds[1].text_content().strip(),
                    "street_town": tds[2].text_content().strip(),
                    "community_village": tds[3].text_content().strip(),
                    "work_time": tds[4].text_content().strip(),
                    "service_tel": tds[5].text_content().strip(),
                    "complaint_tel": tds[6].text_content().strip(),
                }
                # print(data)
                self.save2db(data)
        else:
            print("Error processing page {}".format(p))
            break
        print("------ Finish page {} ------".format(p))
        time.sleep(random.randint(1, 3))

def crawl2(self, url):
    print("processing: {0}".format(url))
    self.headers["User-Agent"] = fake_useragent()
    browser = requests.get(url, headers=self.headers)
    if browser.status_code == 200:
        # root = lxml.html.fromstring(browser.text)
        # lis = root.xpath('//ul[@class="wzsc_bgjd_cs_jbxx"]/li')
        root = BeautifulSoup(browser.text, 'lxml')
        lis = root.find('ul', class_="wzsc_bgjd_cs_jbxx").find_all("li")
        tp = lis[2].contents[1].strip('\r\n').replace('\t', '').replace(' ', '').strip('\r\n').replace('\r\n', ',')
        link = lis[7].find('a')
        website = link.get("href") if link else "#"
        route = root.find_all("div", class_="wzsc_bgjd_cs_p contoxt")[1].text.strip()
        data = {
            "name": str(lis[0].contents[1]) if len(lis[0].contents) > 1 else "",
            "license": str(lis[1].contents[1]) if len(lis[1].contents) > 1 else "",
            "type": tp,
            "addr": str(lis[3].contents[1]) if len(lis[3].contents) > 1 else "",
            "tel": str(lis[4].contents[1]) if len(lis[4].contents) > 1 else "",
            "zip": str(lis[5].contents[1]) if len(lis[5].contents) > 1 else "",
            "email": str(lis[6].contents[1]) if len(lis[6].contents) > 1 else "",
            "website": website,
            "route": route
        }
        self.save2db(data)
    else:
        print("Error when processing url: {0}".format(url))
    time.sleep(random.randint(1, 3))

def crawl(self):
    for m in range(21, 98):
        print("====== Processing page {0} ======".format(m))
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(self.url.format(m), headers=headers)
        if browser.status_code == 200:
            root = lxml.html.fromstring(browser.text)
            divs = root.xpath('//div[@class="info"]')
            for div in divs:
                data = {}
                data["office_name"] = div.xpath("./h2")[0].text_content().strip()
                data["competent_bureau"] = div.xpath("./p[1]")[0].text_content().strip().split(r':')[1]
                data["license"] = div.xpath("./p[2]")[0].text_content().strip().split(r':')[1]
                line = div.xpath("./p[3]")[0].text_content().strip()
                tmp = re.split(r'\xa0\xa0\xa0\xa0', line)
                data["telephone"] = tmp[0].split(r':')[1]
                data["address"] = tmp[1].split(r':')[1]
                data["director"] = div.xpath("./p[4]")[0].text_content().strip().split(r':')[1]
                self.save2db(data)
        else:
            print("Error when crawling page {0}".format(m))
        time.sleep(random.randint(2, 6))

def crawl2(self, url):
    print("processing: {0}".format(url))
    self.headers["User-Agent"] = fake_useragent()
    browser = requests.get(url, headers=self.headers)
    if browser.status_code == 200:
        root = lxml.html.fromstring(browser.text)
        tds = root.xpath('//div[@id="myTab0_Content0"]/table/tr/td')
        data = {
            "name": str(tds[1].text_content()).strip(),
            "license": str(tds[3].text_content()).strip(),
            "director": str(tds[5].text_content()).strip(),
            "representative": str(tds[7].text_content()).strip(),
            "business": str(tds[9].text_content()).strip(),
            "area": str(tds[11].text_content()).strip(),
            "admin_org": str(tds[13].text_content()).strip(),
            "addr": str(tds[15].text_content()).strip(),
            "fax": str(tds[17].text_content()).strip(),
            "zip": str(tds[19].text_content()).strip(),
            "tel": str(tds[21].text_content()).strip(),
            "num": str(tds[23].text_content()).strip(),
        }
        self.save2db(data)
    else:
        print("Error when processing url: {0}".format(url))
    time.sleep(random.randint(1, 3))

def crawl(self):
    for m in range(1, 9500):
        print("====== Processing page {0} ======".format(m))
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(self.url.format(m), headers=headers)
        if browser.status_code == 200:
            root = lxml.html.fromstring(browser.text)
            divs = root.xpath('//div[@class="lawyerinfo"]')
            data = {
                "name": divs[0].xpath("./h3")[0].text_content().strip(),
                "law_office": divs[0].xpath('./p[1]')[0].text_content().split(':')[1],
                "gender": divs[0].xpath('./p[2]')[0].text_content().split(':')[1],
                "license": divs[0].xpath('./p[3]')[0].text_content().split(':')[1],
                "category": divs[0].xpath('./p[4]')[0].text_content().split(':')[1],
            }
            self.save2db(data)
        else:
            print("Error when crawling page {0}".format(m))
        time.sleep(random.randint(1, 2))

def crawl(self):
    for p in range(2, 39):
        print("====== Processing page {0} ======".format(p))
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(self.url.format(p), headers=headers)
        if browser.status_code == 200:
            root = lxml.html.fromstring(browser.text)
            lis = root.xpath('//div[@class="right_message"]/div[@class="right_hy"]/ul/li')
            for li in lis:
                data = {
                    "name": li.xpath('.//div[@class="right_hy_into_name"]')[0].text_content().strip(),
                    "addr": li.xpath('.//div[@class="right_hy_into_zydz"]/span[1]')[0].text_content().strip(),
                    "tel": li.xpath('.//div[@class="right_hy_into_zydz"]/span[2]')[0].text_content().strip(),
                    "operation": li.xpath('.//div[@class="right_hy_into_zydz"]/span[3]')[0].text_content().strip(),
                }
                self.save2db(data)
        else:
            print("Error when crawling page {0}".format(p))
        time.sleep(random.randint(1, 2))

def prepare2crawl(self):
    for p in range(1, 1435):
        print("------ Processing page {}".format(p))
        url = self.url_template.format(p)
        header = {"User-Agent": fake_useragent()}
        req = urllib.request.Request(url, headers=header, method="GET")
        resp = urllib.request.urlopen(req)
        if resp.status == 200:
            html = lxml.html.fromstring(resp.read().decode("utf-8"))
            divs = html.xpath('//div[@class="content_left_bg"]/div')
            if len(divs):
                for n in range(2, len(divs)):
                    tds = divs[n].xpath('.//td')
                    href = tds[2].xpath('./a')[0].attrib["href"]
                    link = self.base_url.format(href)
                    data = {
                        "name": re.sub('律师', '', tds[2].xpath('./a')[0].text_content()),
                        "license": re.split(r'[:|:]', tds[5].text_content())[1],
                        "type": re.split(r'[:|:]', tds[6].text_content())[1],
                        "office": tds[7].text_content().strip(),
                        "expertise": re.split(r'[:|:]', tds[8].text_content())[1],
                        "education": re.split(r'[:|:]', tds[9].text_content())[1],
                        "area": tds[4].text_content().strip(),
                        "link": link,
                    }
                    self.save2db(data)
        else:
            print("Error processing page {}".format(p))

def __init__(self):
    # Init mysql
    self.conn = mysql.connector.connect(**self.config)
    self.cursor = self.conn.cursor()
    self.header = {"User-Agent": fake_useragent()}

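# All of these crawlers persist rows through self.save2db(data), which is not
# shown in this section. A minimal sketch of what it might look like on top of
# the mysql-connector cursor created in __init__ above; the table name and the
# assumption that its columns match the dict keys are placeholders, not taken
# from the original code:
def save2db(self, data):
    # Build the INSERT from the dict keys and let the driver escape the values.
    columns = ", ".join(data.keys())
    placeholders = ", ".join(["%s"] * len(data))
    sql = "INSERT INTO crawl_result ({}) VALUES ({})".format(columns, placeholders)
    self.cursor.execute(sql, tuple(data.values()))
    self.conn.commit()
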
def __init__(self):
    # Init mysql
    self.conn = mysql.connector.connect(**self.config)
    self.cursor = self.conn.cursor()
    self.header = {"User-Agent": fake_useragent()}
    self.domain = "http://www.cqwsjsw.gov.cn{}"
    self.url_list = []
    self.area = {
        "1": "巴南区", "2": "北碚区", "3": "北部新区", "4": "璧山区",
        "5": "长寿区", "6": "城口县", "7": "大渡口区", "8": "大足区",
        "9": "垫江县", "10": "丰都县", "11": "奉节县", "12": "涪陵区",
        "13": "合川区", "14": "江北区", "15": "江津区", "16": "九龙坡区",
        "17": "开县", "18": "梁平县", "19": "南岸区", "20": "南川区",
        "21": "彭水苗族土家族自治县", "22": "綦江区", "23": "黔江区", "24": "荣昌区",
        "25": "沙坪坝区", "26": "石柱土家族自治县", "27": "铜梁区", "28": "潼南区",
        "29": "万盛经开区", "30": "万州区", "31": "巫山县", "32": "巫溪县",
        "33": "武隆县", "34": "秀山县", "35": "永川区", "36": "酉阳县",
        "37": "渝北区", "38": "渝中区", "39": "云阳县", "40": "忠县",
    }
    self.current_area = ""
    self.current_area_id = ""
    self.url_template = "http://www.cqwsjsw.gov.cn/wsfw/jbyf.aspx?fla=sel&name=&Address=&WardId={}&pageNo={}"

def prepare2crawl():
    url_template = "http://jyzx.cqgtfw.gov.cn/ngytd/ngytd.asp?Page={}"
    base_url = "http://jyzx.cqgtfw.gov.cn{}"
    for p in range(1, 30):
        print("------ Processing page {}".format(p))
        header = {
            "User-Agent": fake_useragent(),
            "Cookie": 'safedog-flow-item=68FF67A6B6FD8EB31DDBEC634E6D791A; ASPSESSIONIDACBRDBAR=NCBEMHBDKKIDLJBKGCAILHNM',
            "Host": "jyzx.cqgtfw.gov.cn",
            "Referer": "http://jyzx.cqgtfw.gov.cn/ngytd/ngytd.asp"
        }
        req = urllib.request.Request(url_template.format(p), headers=header, method="GET")
        resp = urllib.request.urlopen(req)
        print(resp.status)
        if resp.status == 200:
            # print(resp.read())
            # print(resp.read().decode("gbk"))
            result = resp.read().decode("gbk")
            html = lxml.html.fromstring(result)
            trs = html.xpath('//tr[@bgcolor="#ffe8e8"]')
            print(trs)
            for tr in trs:
                tds = tr.xpath('.//td')
                a = tds[1].xpath('./a')[0]
                href = a.attrib["href"]
                url = base_url.format(href.lstrip('.'))
                location = a.text_content().strip()
                data = {
                    "location": location,
                    "land_use": tds[2].text_content().strip(),
                    "area": tds[3].text_content().strip(),
                    "allow_area": tds[4].text_content().strip(),
                    "transfer_fee1": tds[5].text_content().strip(),
                    "transfer_fee2": tds[6].text_content().strip(),
                    "remark": tds[7].text_content().strip(),
                    "end_time": tds[8].text_content().strip(),
                    "url": url
                }
                print(data)
                # self.save2db(data)
        else:
            print("Error processing page {}".format(p))
            break
        print("------ Finish page {}".format(p))

def login(self):
    data = {
        "UserID": "360281198903120044",
        "loginPassword": "******",
    }
    headers = {"User-Agent": fake_useragent()}
    response = requests.post(self.login_url, headers=headers, data=json.dumps(data))
    print(response.cookies.get_dict())

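# login() only prints the returned cookies. If later requests need the
# authenticated state, one possible approach (a sketch, not part of the
# original code) is to log in on a requests.Session so the cookies are
# carried on every subsequent call automatically:
def login_with_session(self):
    session = requests.Session()
    session.headers["User-Agent"] = fake_useragent()
    data = {
        "UserID": "...",           # same credentials as in login() above
        "loginPassword": "******",
    }
    session.post(self.login_url, data=json.dumps(data))
    # Requests made through this session now send the login cookies.
    return session
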
def crawl(self): url = "http://ty.cd168.cn/Json/getPoint/" for k in self.cls.keys(): print("++++++++++++++++ {} ++++++++++++++++".format(self.cls[k])) # for _ in range(1, 2): data = { # 行政区划: 0 - 所有区域 "areaid": 0, # 1 - 公共场馆 # 2 - 体育彩票 # 3 - 体质监测 # 4 - 健身路径 # 5 - 健身会所 # 6 - 学校场地 # 7 - 体育培训 # 8 - 体育用品 "categoryid": 8, # 分类项目: 0 - 所有分类 "classid": int(k), # "classid": 0, } self.headers["User-Agent"] = fake_useragent() browser = requests.post(url, headers=self.headers, data=urlencode(data)) if browser.status_code == 200: result = json.loads(browser.text) print(result["Page"]["TotalCount"]) total = int(result["Page"]["TotalPage"]) + 1 for p in range(1, total): print("------ {} ------".format(p)) data["pageindex"] = p browser = requests.post(url, headers=self.headers, data=urlencode(data)) result = json.loads(browser.text) points = result["Point"] for point in points: tmp = { "name": point["PointName"], "principal": point["Principal"], "tel": point["Phone"], "addr": point["Address"], "website": point["WebSite"], "area": self.area[str(point["AreaID"])], "class": self.cls[k], "latitude": str(point["Lat"]), "longitude": str(point["Lon"]) } self.save2db(tmp)
def crawl(self): print("crawling page 1") headers = {"User-Agent": fake_useragent()} browser = requests.get(self.URL, headers=headers) if browser.status_code == 200: html = lxml.html.fromstring(browser.text) view_state_div = html.xpath('//input[@id="__VIEWSTATE"]') self.__VIEWSTATE = view_state_div[0].attrib["value"] self.__EVENTTARGET = "pagerExhibit" self.crawl2() else: print("Error while crawling page 1")
def crawl(self): print("====== Processing page 1 ======") headers = { "User-Agent": fake_useragent(), "Content-Type": "application/x-www-form-urlencoded", } data = { "name": "xxcx_cpzlbljlcxfw", "sql": "", "title": "产品质量监督抽查不良记录查询", "orderby": "", "startpage": "1", "pageSize": "500", "mode": "hdjl", "refresh": "true", "paging": "true", "align": "center", "queryed": "true", "searched": "true", "columnSetting": '[{"code":"cpflmc","show":false},{"code":"cpmc","display":"产品名称","show":true,"width":"250","reminder":""},{"code":"qymc","display":"企业名称","width":"200","reminder":""},{"code":"cjsj","display":"抽检时间","width":"70","reminder":""}]', "searchSetting": '[{"columnname":"cpmc","label":"产品名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"1"},{"columnname":"qymc","label":"企业名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"2"},{"columnname":"cjsj","label":"抽检时间","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"3"},{"columnname":"cpflmc","label":"产品分类","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"4"}]', "functionSetting": '[]' } browser = requests.post(self.url, headers=headers, data=urlencode(data), timeout=60) if browser.status_code == 200: result = json.loads(browser.text) html = lxml.html.fromstring(result["data"]) trs = html.xpath('//div[@class="mem_tbls"]/table/tr') print(len(trs)) for n in range(1, len(trs)): print("------ processing no {} ------".format(n)) tds = trs[n].xpath('.//td') data = { "name": tds[0].text_content().strip(), "enterprise": tds[1].text_content().strip(), "time": tds[2].text_content().strip(), } self.save2db(data) else: print("Error when crawling page")
def crawl2(self, url): print("processing url: {}".format(url)) headers = { "User-Agent": fake_useragent() } browser = requests.get(url, headers=headers) if browser.status_code == 200: html = lxml.html.fromstring(browser.text) data = { "name": html.xpath('//span[@id="lbUnitName"]')[0].text_content().strip(), "addr": html.xpath('//span[@id="lbAddress"]')[0].text_content().strip(), "level": html.xpath('//span[@id="lbAptGrade"]')[0].text_content().strip(), "certificate_no": html.xpath('//span[@id="lbcertificateNo"]')[0].text_content().strip(), } self.save2db(data) else: print("Error while crawling page {}".format(url))
def prepare2crawl(self):
    url = "http://ty.cd168.cn/Json/getClass/"
    data = {"categoryid": 8}
    self.headers["User-Agent"] = fake_useragent()
    browser = requests.post(url, headers=self.headers, data=urlencode(data))
    if browser.status_code == 200:
        items = json.loads(browser.text)
        for item in items:
            name = item["ItemName"].strip()
            self.cls[item["ID"]] = re.sub(r'\s+', '', name)
        self.crawl()
    else:
        print("Error getting class")

def crawl(self): print("crawling page 1") headers = { "User-Agent": fake_useragent() } browser = requests.get(self.URL, headers=headers) if browser.status_code == 200: html = lxml.html.fromstring(browser.text) view_state_div = html.xpath('//input[@id="__VIEWSTATE"]') self.__VIEWSTATE = view_state_div[0].attrib["value"] self.__EVENTTARGET = "pagerExhibit" self.crawl2() else: print("Error while crawling page 1")
def crawl(self, url):
    self.headers["User-Agent"] = fake_useragent()
    browser = requests.get(url, headers=self.headers)
    if browser.status_code == 200:
        html = lxml.html.fromstring(browser.text)
        title = html.xpath('//span[@id="lblBT"]')[0].text_content().strip()
        m = re.search(r'(\d+)年(\d+)月(\d+)日', title)
        year = m.group(1)
        month = m.group(2)
        day = m.group(3)
        month = "0" + month if len(month) == 1 else month
        day = "0" + day if len(day) == 1 else day
        date = year + month + day
        src = html.xpath('//iframe[@id="wzzwInfo"]')[0].attrib["src"]
        url = self.base_url + src.replace('..', '')
        print("{0}: {1}".format(date, url))
        browser = requests.get(url, headers=self.headers)
        if browser.encoding != "utf-8":
            browser.encoding = "gb2312"
        html = lxml.html.fromstring(browser.text)
        trs = html.xpath('//tr')
        for n in range(1, len(trs)):
            tds = trs[n].xpath('.//td')
            if len(tds):
                data = {
                    "no": re.sub(r'\r|\n', '', tds[1].text_content().strip()),
                    "addr": re.sub(r'\r|\n', '', tds[2].text_content().strip()),
                    "area": re.sub(r'\r|\n', '', tds[3].text_content().strip()),
                    "price": re.sub(r'\r|\n', '', tds[4].text_content().strip()),
                    "winner": re.sub(r'\r|\n', '', tds[5].text_content().strip()),
                    "date": date,
                }
                print(data)
                self.save2db(data)

def crawl(self): print("====== Processing page 1 ======") headers = { "User-Agent": fake_useragent(), "Content-Type": "application/x-www-form-urlencoded", } data = { "name": "xxcx_scmpcp", "sql": "", "title": "四川名牌产品", "orderby": "", "startpage": "1", "pageSize": "1000", "mode": "hdjl", "refresh": "true", "paging": "true", "align": "center", "queryed": "true", "searched": "true", "columnSetting": '[{"code":"cpmc","display":"产品名称","show":true,"width":"250","reminder":""},{"code":"qymc","display":"企业名称","width":"200","reminder":""},{"code":"zsbh","display":"证书编号","width":"70","reminder":""},{"code":"xzqh","display":"行政区划","width":"100","reminder":""}]', "searchSetting": '[{"columnname":"cpmc","label":"产品名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"1"},{"columnname":"qymc","label":"企业名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"2"},{"columnname":"zsbh","label":"证书编号","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"3"},{"columnname":"xzqh","label":"行政区划","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"4"}]', "functionSetting": '[]' } browser = requests.post(self.url, headers=headers, data=urlencode(data), timeout=60) if browser.status_code == 200: result = json.loads(browser.text) html = lxml.html.fromstring(result["data"]) trs = html.xpath('//div[@class="mem_tbls"]/table/tr') print(len(trs)) for n in range(1, len(trs)): print("------ processing no {} ------".format(n)) tds = trs[n].xpath('.//td') data = { "name": tds[0].text_content().strip(), "enterprise": tds[1].text_content().strip(), "certificate_no": tds[2].text_content().strip(), "area": tds[3].text_content().strip(), } self.save2db(data) else: print("Error when crawling page")
def crawl2(self, url, msg):
    print("processing url: {}".format(url))
    headers = {"User-Agent": fake_useragent()}
    browser = requests.get(url, headers=headers)
    if browser.status_code == 200:
        html = lxml.html.fromstring(browser.text)
        td = html.xpath('//*[@id="main_body"]/div[2]/table/tr/td/table/tr[2]/td/table/tr[5]/td')
        intro = str(td[0].text_content())
        data = {
            "name": msg[0],
            "type": msg[1],
            "intro": intro,
        }
        self.save2db(data)
    else:
        print("Error while crawling page {}".format(url))

def crawl(self):
    for n in range(1, 137):
        print("------ Crawling page {} ------".format(n))
        url = self.URL.format(n)
        print(url)
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(url, headers=headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            links = html.xpath('//div[@class="search_Newslistinn"]/ul/li/div/a')
            for link in links:
                tmp_url = self.BASE_URL + link.attrib["href"]
                self.crawl2(tmp_url)
        else:
            print("Error while crawling page {}".format(n))
        time.sleep(random.randint(1, 3))

def prepare2crawl(self):
    for p in range(10, 13):
        print("----------- Crawling page {} -----------".format(p))
        data = {
            '__VIEWSTATE': '/wEPDwULLTExNzkxNTY4MjEPZBYCAgMPZBYEAgEPFgIeC18hSXRlbUNvdW50Ag8WHmYPZBYCZg8VAwoyMDE2LzA4LzE5BjMwNjU1NTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA45pyIMTnml6UpZAIBD2QWAmYPFQMKMjAxNi8wOC8wOQYyOTEyODAx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwOOaciDA55pelKWQCAg9kFgJmDxUDCjIwMTYvMDgvMDQGMjkwNDQ5MeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDjmnIgwNOaXpSlkAgMPZBYCZg8VAwoyMDE2LzA3LzI3BjI4MjkyMDHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA35pyIMjfml6UpZAIED2QWAmYPFQMKMjAxNi8wNy8yMQYyNzY3OTAx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwN+aciDIx5pelKWQCBQ9kFgJmDxUDCjIwMTYvMDcvMTMGMjcwMjMwMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDfmnIgxM+aXpSlkAgYPZBYCZg8VAwoyMDE2LzA3LzA2BjI2NjY1MzHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA35pyIMDbml6UpZAIHD2QWAmYPFQMKMjAxNi8wNi8yOQYyNTkxMTMx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNuaciDI55pelKWQCCA9kFgJmDxUDCjIwMTYvMDYvMjEGMjUyNzEzMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDbmnIgyMeaXpSlkAgkPZBYCZg8VAwoyMDE2LzA2LzE2BjI0ODE5OTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA25pyIMTbml6UpZAIKD2QWAmYPFQMKMjAxNi8wNi8xNAYyNDM1MjEx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNuaciDE05pelKWQCCw9kFgJmDxUDCjIwMTYvMDYvMDEGMjM2MTQxMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDbmnIgwMeaXpSlkAgwPZBYCZg8VAwoyMDE2LzA1LzMxBjIzNTM0NTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA15pyIMzHml6UpZAIND2QWAmYPFQMKMjAxNi8wNS8yNQYyMzM2MjUx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNeaciDI15pelKWQCDg9kFgJmDxUDCjIwMTYvMDUvMTkGMjMwNTM1MeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDXmnIgxOeaXpSlkAgMPDxYEHgtSZWNvcmRjb3VudAKpAR4QQ3VycmVudFBhZ2VJbmRleAIBZGRknBpMzCh1lVAb0hQ+KYqaC3XY/ObAUBQzQyX+ubYfCdU=',
            '__EVENTTARGET': 'Pager',
            '__EVENTARGUMENT': p,
        }
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.post(self.post_url, headers=self.headers, data=urlencode(data))
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            lis = html.xpath('//div[@class="list1"]/li')
            for li in lis:
                href = li.xpath('./a')[0].attrib["href"]
                url = href.replace('..', '')
                self.crawl(self.base_url + url)
        else:
            print("Error crawling page {}".format(p))

def crawl(self, start, end):
    for p in range(start, end + 1):
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(self.url_template.format(p), headers=headers)
        browser.encoding = "utf-8"
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            lis = html.xpath('//ul[@class="tabsContainer_ul"]/li')
            # Universities of current page
            for li in lis:
                # University url like:
                # http://kaoshi.edu.sina.com.cn/college/c/10001.shtml
                url = li.xpath('./a')[0].attrib["href"]
                # Extract university id from url
                match = re.findall(r'/(\d+)\.shtml', url)
                uid = int(match[0]) if match else 0
                divs = li.xpath('.//div[@class="clearfix"]')
                # Extract name, weibo
                links = divs[0].xpath('.//a')
                name = links[0].text_content().strip()
                num = len(links)
                weibo_official = "-"
                weibo_enrollment_office = "-"
                for n in range(1, num):
                    text = links[n].text_content().strip()
                    if text == "官方微博":
                        weibo_official = links[n].attrib["href"]
                    if text == "招办微博":
                        weibo_enrollment_office = links[n].attrib["href"]
                # Extract info
                ps = divs[1].xpath('.//p')
                location = ps[0].text_content().strip().split(':')[1].strip()
                utype = ps[2].text_content().strip().split(':')[1].strip()
                subject_to = ps[4].text_content().strip().split(':')[1].strip()
                tmp = re.findall(r'(\d+)', ps[1].text_content().strip())
                key_discipline = "-" if len(tmp) == 0 else tmp[0]
                tmp = re.findall(r'(\d+)', ps[3].text_content().strip())
                master = "-" if len(tmp) == 0 else tmp[0]
                tmp = re.findall(r'(\d+)', ps[5].text_content().strip())
                doctor = "-" if len(tmp) == 0 else tmp[0]
                # Extract tags
                tags = []
                spans = divs[2].xpath('.//span')
                for span in spans:
                    tags.append(span.text_content().strip())
                doc = {
                    "uid": uid,                                          # university ID
                    "name": name,                                        # university name
                    "weibo_official": weibo_official,                    # official Weibo
                    "weibo_enrollment_office": weibo_enrollment_office,  # admissions office Weibo
                    "location": location,                                # location
                    "utype": utype,                                      # type
                    "subject_to": subject_to,                            # affiliation
                    "key_discipline": key_discipline,                    # key disciplines
                    "master": master,                                    # number of master's programs
                    "doctor": doctor,                                    # number of doctoral programs
                    "tags": tags,                                        # tags
                    "url": url,                                          # URL for the second crawl pass
                }
                self.collection.insert_one(doc)
            print("Page{:>6}: [done]".format(p))
            print("Page{:>6}: [done]".format(p), file=self.log)
        else:
            print("Page{:>6}: [fail]".format(p))
            print("Page{:>6}: [fail]".format(p), file=self.log)
    # Close file handler
    self.log.close()
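
# The university crawler above writes to self.collection (a pymongo collection)
# and self.log (an open file handle), both created elsewhere. A sketch of a
# matching __init__, assuming pymongo and a local MongoDB instance; the
# database, collection, and log-file names are placeholders, and the paged
# list URL is elided rather than guessed:
import pymongo

def __init__(self):
    client = pymongo.MongoClient("mongodb://localhost:27017/")
    self.collection = client["crawler"]["universities"]   # placeholder db/collection names
    self.url_template = "..."                              # paged list URL, not shown in this section
    self.log = open("crawl.log", "a", encoding="utf-8")    # placeholder log path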