Example No. 1
    def crawl2(self):
        for p in range(self.start_page, self.total_page + 1):
            time.sleep(random.randint(1, 3))

            data = {
                "__VIEWSTATE": self.__VIEWSTATE,
                "__EVENTARGUMENT": p,
                "__EVENTTARGET": self.__EVENTTARGET,
            }

            print("crawling page {}".format(p))
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": fake_useragent()
            }
            browser = requests.post(self.URL, headers=headers, data=urlencode(data))
            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)
                trs = html.xpath('//table[@id="DataGrid1"]/tr')
                for n in range(1, len(trs)):
                    tds = trs[n].xpath('.//td')
                    data = {
                        "name": tds[1].text_content().strip(),
                        "area": tds[2].text_content().strip(),
                        "addr": tds[3].text_content().strip(),
                        "level": tds[4].text_content().strip(),
                    }
                    print(data)
                    self.save2db(data)
            else:
                print("Error while crawling page {}".format(p))
Example No. 2
    def prepare2crawl(self, start):
        url = "http://www.cqwsjsw.gov.cn/wsfw/yjff.aspx?flag=sel&seldq=0&pageNo={}"

        # for p in range(start, 704):
        for p in range(704, 705):
            print("------ Processing page {} ------".format(p))
            header = {
                "User-Agent": fake_useragent()
            }

            req = urllib.request.Request(url.format(p), headers=header, method="GET")
            resp = urllib.request.urlopen(req)

            if resp.status == 200:
                html = lxml.html.fromstring(resp.read().decode("gbk"))
                # html = lxml.html.fromstring(resp.read().decode("gb18030"))
                tables = html.xpath('//td[@height="34"]/table')
                for n in range(1, len(tables)):
                    tds = tables[n].xpath('.//td')
                    data = {
                        "area": tds[1].text_content().strip(),
                        "street_town": tds[2].text_content().strip(),
                        "community_village": tds[3].text_content().strip(),
                        "work_time": tds[4].text_content().strip(),
                        "service_tel": tds[5].text_content().strip(),
                        "complaint_tel": tds[6].text_content().strip(),
                    }
                    # print(data)
                    self.save2db(data)
            else:
                print("Error processing page {}".format(p))
                break
            print("------ Finish page {} ------".format(p))
            time.sleep(random.randint(1, 3))
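The commented-out gb18030 line above suggests this site occasionally serves characters outside the gbk range. A small sketch of a decode helper that falls back to gb18030 (a superset of gbk) when plain gbk decoding fails; the helper name is an assumption, not part of the original code:

def decode_cn(raw_bytes):
    # gb18030 decodes everything gbk does plus the characters gbk lacks,
    # so it is a safe fallback for pages declared as gbk.
    try:
        return raw_bytes.decode("gbk")
    except UnicodeDecodeError:
        return raw_bytes.decode("gb18030")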
Example No. 3
    def crawl2(self, url):
        print("processing: {0}".format(url))
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.get(url, headers=self.headers)
        if browser.status_code == 200:
            # root = lxml.html.fromstring(browser.text)
            # lis = root.xpath('//ul[@class="wzsc_bgjd_cs_jbxx"]/li')

            root = BeautifulSoup(browser.text, 'lxml')
            lis = root.find('ul', class_="wzsc_bgjd_cs_jbxx").find_all("li")

            tp = lis[2].contents[1].strip('\r\n').replace('\t', '').replace(' ', '').strip('\r\n').replace('\r\n', ',')
            link = lis[7].find('a')
            website = link.get("href") if link else "#"
            route = root.find_all("div", class_="wzsc_bgjd_cs_p contoxt")[1].text.strip()

            data = {
                "name": str(lis[0].contents[1]) if len(lis[0].contents) > 1 else "",
                "license": str(lis[1].contents[1]) if len(lis[1].contents) > 1 else "",
                "type": tp,
                "addr": str(lis[3].contents[1]) if len(lis[3].contents) > 1 else "",
                "tel": str(lis[4].contents[1]) if len(lis[4].contents) > 1 else "",
                "zip": str(lis[5].contents[1]) if len(lis[5].contents) > 1 else "",
                "email": str(lis[6].contents[1]) if len(lis[6].contents) > 1 else "",
                "website": website,
                "route": route
            }
            self.save2db(data)

        else:
            print("Error when processing url: {0}".format(url))
        time.sleep(random.randint(1, 3))
Example No. 4
    def crawl(self):
        for m in range(21, 98):
            print("====== Processing page {0} ======".format(m))

            headers = {"User-Agent": fake_useragent()}
            browser = requests.get(self.url.format(m), headers=headers)

            if browser.status_code == 200:
                root = lxml.html.fromstring(browser.text)
                divs = root.xpath('//div[@class="info"]')
                for div in divs:
                    data = {}
                    data["office_name"] = div.xpath(
                        "./h2")[0].text_content().strip()
                    data["competent_bureau"] = div.xpath(
                        "./p[1]")[0].text_content().strip().split(r':')[1]
                    data["license"] = div.xpath(
                        "./p[2]")[0].text_content().strip().split(r':')[1]
                    line = div.xpath("./p[3]")[0].text_content().strip()
                    tmp = re.split(r'\xa0\xa0\xa0\xa0', line)
                    data["telephone"] = tmp[0].split(r':')[1]
                    data["address"] = tmp[1].split(r':')[1]
                    data["director"] = div.xpath(
                        "./p[4]")[0].text_content().strip().split(r':')[1]
                    self.save2db(data)
            else:
                print("Error when crawling page {0}".format(m))

            time.sleep(random.randint(2, 6))
Example No. 5
    def crawl2(self):
        for p in range(self.start_page, self.total_page + 1):
            time.sleep(random.randint(1, 3))

            data = {
                "__VIEWSTATE": self.__VIEWSTATE,
                "__EVENTARGUMENT": p,
                "__EVENTTARGET": self.__EVENTTARGET,
            }

            print("crawling page {}".format(p))
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": fake_useragent()
            }
            browser = requests.post(self.URL,
                                    headers=headers,
                                    data=urlencode(data))
            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)
                trs = html.xpath('//table[@id="DataGrid1"]/tr')
                for n in range(1, len(trs)):
                    tds = trs[n].xpath('.//td')
                    data = {
                        "name": tds[1].text_content().strip(),
                        "area": tds[2].text_content().strip(),
                        "addr": tds[3].text_content().strip(),
                        "level": tds[4].text_content().strip(),
                    }
                    print(data)
                    self.save2db(data)
            else:
                print("Error while crawling page {}".format(p))
Example No. 6
    def crawl2(self, url):
        print("processing: {0}".format(url))
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.get(url, headers=self.headers)
        if browser.status_code == 200:
            root = lxml.html.fromstring(browser.text)
            tds = root.xpath('//div[@id="myTab0_Content0"]/table/tr/td')

            data = {
                "name": str(tds[1].text_content()).strip(),
                "license": str(tds[3].text_content()).strip(),
                "director": str(tds[5].text_content()).strip(),
                "representative": str(tds[7].text_content()).strip(),
                "business": str(tds[9].text_content()).strip(),
                "area": str(tds[11].text_content()).strip(),
                "admin_org": str(tds[13].text_content()).strip(),
                "addr": str(tds[15].text_content()).strip(),
                "fax": str(tds[17].text_content()).strip(),
                "zip": str(tds[19].text_content()).strip(),
                "tel": str(tds[21].text_content()).strip(),
                "num": str(tds[23].text_content()).strip(),
            }
            self.save2db(data)

        else:
            print("Error when processing url: {0}".format(url))
        time.sleep(random.randint(1, 3))
Example No. 7
    def crawl2(self, url):
        print("processing: {0}".format(url))
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.get(url, headers=self.headers)
        if browser.status_code == 200:
            root = lxml.html.fromstring(browser.text)
            tds = root.xpath('//div[@id="myTab0_Content0"]/table/tr/td')

            data = {
                "name": str(tds[1].text_content()).strip(),
                "license": str(tds[3].text_content()).strip(),
                "director": str(tds[5].text_content()).strip(),
                "representative": str(tds[7].text_content()).strip(),
                "business": str(tds[9].text_content()).strip(),
                "area": str(tds[11].text_content()).strip(),
                "admin_org": str(tds[13].text_content()).strip(),
                "addr": str(tds[15].text_content()).strip(),
                "fax": str(tds[17].text_content()).strip(),
                "zip": str(tds[19].text_content()).strip(),
                "tel": str(tds[21].text_content()).strip(),
                "num": str(tds[23].text_content()).strip(),
            }
            self.save2db(data)

        else:
            print("Error when processing url: {0}".format(url))
        time.sleep(random.randint(1, 3))
Example No. 8
    def crawl(self):
        for m in range(1, 9500):
            print("====== Processing page {0} ======".format(m))

            headers = {"User-Agent": fake_useragent()}
            browser = requests.get(self.url.format(m), headers=headers)

            if browser.status_code == 200:
                root = lxml.html.fromstring(browser.text)
                divs = root.xpath('//div[@class="lawyerinfo"]')
                data = {
                    "name":
                    divs[0].xpath("./h3")[0].text_content().strip(),
                    "law_office":
                    divs[0].xpath('./p[1]')[0].text_content().split(':')[1],
                    "gender":
                    divs[0].xpath('./p[2]')[0].text_content().split(':')[1],
                    "license":
                    divs[0].xpath('./p[3]')[0].text_content().split(':')[1],
                    "category":
                    divs[0].xpath('./p[4]')[0].text_content().split(':')[1],
                }
                self.save2db(data)
            else:
                print("Error when crawling page {0}".format(m))

            time.sleep(random.randint(1, 2))
Example No. 9
    def crawl(self):
        for p in range(2, 39):
            print("====== Processing page {0} ======".format(p))

            headers = {"User-Agent": fake_useragent()}
            browser = requests.get(self.url.format(p), headers=headers)

            if browser.status_code == 200:
                root = lxml.html.fromstring(browser.text)
                lis = root.xpath(
                    '//div[@class="right_message"]/div[@class="right_hy"]/ul/li'
                )
                for li in lis:
                    data = {
                        "name":
                        li.xpath('.//div[@class="right_hy_into_name"]')
                        [0].text_content().strip(),
                        "addr":
                        li.xpath('.//div[@class="right_hy_into_zydz"]/span[1]')
                        [0].text_content().strip(),
                        "tel":
                        li.xpath('.//div[@class="right_hy_into_zydz"]/span[2]')
                        [0].text_content().strip(),
                        "operation":
                        li.xpath('.//div[@class="right_hy_into_zydz"]/span[3]')
                        [0].text_content().strip(),
                    }
                    self.save2db(data)
            else:
                print("Error when crawling page {0}".format(p))

            time.sleep(random.randint(1, 2))
Example No. 10
    def prepare2crawl(self):
        for p in range(1, 1435):
            print("------ Processing page {}".format(p))
            url = self.url_template.format(p)

            header = {
                "User-Agent": fake_useragent()
            }

            req = urllib.request.Request(url, headers=header, method="GET")
            resp = urllib.request.urlopen(req)

            if resp.status == 200:
                html = lxml.html.fromstring(resp.read().decode("utf-8"))

                divs = html.xpath('//div[@class="content_left_bg"]/div')
                if len(divs):
                    for n in range(2, len(divs)):
                        tds = divs[n].xpath('.//td')
                        href = tds[2].xpath('./a')[0].attrib["href"]
                        link = self.base_url.format(href)
                        data = {
                            "name": re.sub('律师', '', tds[2].xpath('./a')[0].text_content()),
                            "license": re.split(r'[:|:]', tds[5].text_content())[1],
                            "type": re.split(r'[:|:]', tds[6].text_content())[1],
                            "office": tds[7].text_content().strip(),
                            "expertise": re.split(r'[:|:]', tds[8].text_content())[1],
                            "education": re.split(r'[:|:]', tds[9].text_content())[1],
                            "area": tds[4].text_content().strip(),
                            "link": link,
                        }
                        self.save2db(data)
            else:
                print("Error processing page {}".format(p))
Example No. 11
    def __init__(self):
        # Init mysql
        self.conn = mysql.connector.connect(**self.config)
        self.cursor = self.conn.cursor()

        self.header = {
            "User-Agent": fake_useragent()
        }
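The save2db() method called throughout this listing is never shown. A minimal sketch of what it might look like with the mysql.connector cursor initialised above, assuming a hypothetical table whose columns mirror the keys of the data dict built in Example No. 1:

    def save2db(self, data):
        # Hypothetical table and column names; the parameterized query
        # lets mysql.connector handle quoting and escaping.
        sql = ("INSERT INTO hospitals (name, area, addr, level) "
               "VALUES (%(name)s, %(area)s, %(addr)s, %(level)s)")
        self.cursor.execute(sql, data)
        self.conn.commit()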
Example No. 12
    def __init__(self):
        # Init mysql
        self.conn = mysql.connector.connect(**self.config)
        self.cursor = self.conn.cursor()

        self.header = {
            "User-Agent": fake_useragent()
        }

        self.domain = "http://www.cqwsjsw.gov.cn{}"
        self.url_list = []
        self.area = {
            "1": "巴南区",
            "2": "北碚区",
            "3": "北部新区",
            "4": "璧山区",
            "5": "长寿区",
            "6": "城口县",
            "7": "大渡口区",
            "8": "大足区",
            "9": "垫江县",
            "10": "丰都县",
            "11": "奉节县",
            "12": "涪陵区",
            "13": "合川区",
            "14": "江北区",
            "15": "江津区",
            "16": "九龙坡区",
            "17": "开县",
            "18": "梁平县",
            "19": "南岸区",
            "20": "南川区",
            "21": "彭水苗族土家族自治县",
            "22": "綦江区",
            "23": "黔江区",
            "24": "荣昌区",
            "25": "沙坪坝区",
            "26": "石柱土家族自治县",
            "27": "铜梁区",
            "28": "潼南区",
            "29": "万盛经开区",
            "30": "万州区",
            "31": "巫山县",
            "32": "巫溪县",
            "33": "武隆县",
            "34": "秀山县",
            "35": "永川区",
            "36": "酉阳县",
            "37": "渝北区",
            "38": "渝中区",
            "39": "云阳县",
            "40": "忠县",
        }
        self.current_area = ""
        self.current_area_id = ""
        self.url_template = "http://www.cqwsjsw.gov.cn/wsfw/jbyf.aspx?fla=sel&name=&Address=&WardId={}&pageNo={}"
Example No. 13
    def __init__(self):
        # Init mysql
        self.conn = mysql.connector.connect(**self.config)
        self.cursor = self.conn.cursor()

        self.header = {"User-Agent": fake_useragent()}

        self.domain = "http://www.cqwsjsw.gov.cn{}"
        self.url_list = []
        self.area = {
            "1": "巴南区",
            "2": "北碚区",
            "3": "北部新区",
            "4": "璧山区",
            "5": "长寿区",
            "6": "城口县",
            "7": "大渡口区",
            "8": "大足区",
            "9": "垫江县",
            "10": "丰都县",
            "11": "奉节县",
            "12": "涪陵区",
            "13": "合川区",
            "14": "江北区",
            "15": "江津区",
            "16": "九龙坡区",
            "17": "开县",
            "18": "梁平县",
            "19": "南岸区",
            "20": "南川区",
            "21": "彭水苗族土家族自治县",
            "22": "綦江区",
            "23": "黔江区",
            "24": "荣昌区",
            "25": "沙坪坝区",
            "26": "石柱土家族自治县",
            "27": "铜梁区",
            "28": "潼南区",
            "29": "万盛经开区",
            "30": "万州区",
            "31": "巫山县",
            "32": "巫溪县",
            "33": "武隆县",
            "34": "秀山县",
            "35": "永川区",
            "36": "酉阳县",
            "37": "渝北区",
            "38": "渝中区",
            "39": "云阳县",
            "40": "忠县",
        }
        self.current_area = ""
        self.current_area_id = ""
        self.url_template = "http://www.cqwsjsw.gov.cn/wsfw/jbyf.aspx?fla=sel&name=&Address=&WardId={}&pageNo={}"
Example No. 14
    def prepare2crawl(self):
        url_template = "http://jyzx.cqgtfw.gov.cn/ngytd/ngytd.asp?Page={}"
        base_url = "http://jyzx.cqgtfw.gov.cn{}"

        for p in range(1, 30):
            print("------ Processing page {}".format(p))
            header = {
                "User-Agent": fake_useragent(),
                "Cookie":
                'safedog-flow-item=68FF67A6B6FD8EB31DDBEC634E6D791A; ASPSESSIONIDACBRDBAR=NCBEMHBDKKIDLJBKGCAILHNM',
                "Host": "jyzx.cqgtfw.gov.cn",
                "Referer": "http://jyzx.cqgtfw.gov.cn/ngytd/ngytd.asp"
            }

            req = urllib.request.Request(url_template.format(p),
                                         headers=header,
                                         method="GET")
            resp = urllib.request.urlopen(req)

            print(resp.status)
            if resp.status == 200:
                # print(resp.read())
                # print(resp.read().decode("gbk"))
                result = resp.read().decode("gbk")
                html = lxml.html.fromstring(result)
                trs = html.xpath('//tr[@bgcolor="#ffe8e8"]')
                print(trs)
                for tr in trs:
                    tds = tr.xpath('.//td')
                    a = tds[1].xpath('./a')[0]
                    href = a.attrib["href"]
                    url = base_url.format(href.lstrip('.'))
                    location = a.text_content().strip()
                    data = {
                        "location": location,
                        "land_use": tds[2].text_content().strip(),
                        "area": tds[3].text_content().strip(),
                        "allow_area": tds[4].text_content().strip(),
                        "transfer_fee1": tds[5].text_content().strip(),
                        "transfer_fee2": tds[6].text_content().strip(),
                        "remark": tds[7].text_content().strip(),
                        "end_time": tds[8].text_content().strip(),
                        "url": url
                    }
                    print(data)
                    # self.save2db(data)
            else:
                print("Error processing page {}".format(p))
                break

            print("------ Finish page {}".format(p))
Example No. 15
    def login(self):

        data = {
            "UserID": "360281198903120044",
            "loginPassword": "******",
        }

        headers = {"User-Agent": fake_useragent()}

        response = requests.post(self.login_url,
                                 headers=headers,
                                 data=json.dumps(data))

        print(response.cookies.get_dict())
Example No. 16
    def login(self):

        data = {
            "UserID": "360281198903120044",
            "loginPassword": "******",
        }

        headers = {
            "User-Agent": fake_useragent()
        }

        response = requests.post(self.login_url, headers=headers, data=json.dumps(data))

        print(response.cookies.get_dict())
Example No. 17
    def prepare2crawl(self):
        url_template = "http://jyzx.cqgtfw.gov.cn/ngytd/ngytd.asp?Page={}"
        base_url = "http://jyzx.cqgtfw.gov.cn{}"

        for p in range(1, 30):
            print("------ Processing page {}".format(p))
            header = {
                "User-Agent": fake_useragent(),
                "Cookie": 'safedog-flow-item=68FF67A6B6FD8EB31DDBEC634E6D791A; ASPSESSIONIDACBRDBAR=NCBEMHBDKKIDLJBKGCAILHNM',
                "Host": "jyzx.cqgtfw.gov.cn",
                "Referer": "http://jyzx.cqgtfw.gov.cn/ngytd/ngytd.asp"

            }

            req = urllib.request.Request(url_template.format(p), headers=header, method="GET")
            resp = urllib.request.urlopen(req)

            print(resp.status)
            if resp.status == 200:
                # print(resp.read())
                # print(resp.read().decode("gbk"))
                result = resp.read().decode("gbk")
                html = lxml.html.fromstring(result)
                trs = html.xpath('//tr[@bgcolor="#ffe8e8"]')
                print(trs)
                for tr in trs:
                    tds = tr.xpath('.//td')
                    a = tds[1].xpath('./a')[0]
                    href = a.attrib["href"]
                    url = base_url.format(href.lstrip('.'))
                    location = a.text_content().strip()
                    data = {
                        "location": location,
                        "land_use": tds[2].text_content().strip(),
                        "area": tds[3].text_content().strip(),
                        "allow_area": tds[4].text_content().strip(),
                        "transfer_fee1": tds[5].text_content().strip(),
                        "transfer_fee2": tds[6].text_content().strip(),
                        "remark": tds[7].text_content().strip(),
                        "end_time": tds[8].text_content().strip(),
                        "url": url
                    }
                    print(data)
                    # self.save2db(data)
            else:
                print("Error processing page {}".format(p))
                break

            print("------ Finish page {}".format(p))
Example No. 18
    def crawl(self):
        url = "http://ty.cd168.cn/Json/getPoint/"

        for k in self.cls.keys():
            print("++++++++++++++++ {} ++++++++++++++++".format(self.cls[k]))
        # for _ in range(1, 2):

            data = {
                # Administrative division: 0 - all areas
                "areaid": 0,
                # 1 - public venues
                # 2 - sports lottery
                # 3 - physical fitness testing
                # 4 - fitness paths
                # 5 - fitness clubs
                # 6 - school grounds
                # 7 - sports training
                # 8 - sporting goods
                "categoryid": 8,
                # Class within the category: 0 - all classes
                "classid": int(k),
                # "classid": 0,
            }
            self.headers["User-Agent"] = fake_useragent()
            browser = requests.post(url, headers=self.headers, data=urlencode(data))
            if browser.status_code == 200:
                result = json.loads(browser.text)
                print(result["Page"]["TotalCount"])
                total = int(result["Page"]["TotalPage"]) + 1
                for p in range(1, total):
                    print("------ {} ------".format(p))
                    data["pageindex"] = p
                    browser = requests.post(url, headers=self.headers, data=urlencode(data))
                    result = json.loads(browser.text)

                    points = result["Point"]
                    for point in points:
                        tmp = {
                            "name": point["PointName"],
                            "principal": point["Principal"],
                            "tel": point["Phone"],
                            "addr": point["Address"],
                            "website": point["WebSite"],
                            "area": self.area[str(point["AreaID"])],
                            "class": self.cls[k],
                            "latitude": str(point["Lat"]),
                            "longitude": str(point["Lon"])
                        }
                        self.save2db(tmp)
Example No. 19
    def crawl(self):
        print("crawling page 1")
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(self.URL, headers=headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)

            view_state_div = html.xpath('//input[@id="__VIEWSTATE"]')
            self.__VIEWSTATE = view_state_div[0].attrib["value"]
            self.__EVENTTARGET = "pagerExhibit"

            self.crawl2()

        else:
            print("Error while crawling page 1")
Example No. 20
    def crawl(self):
        print("====== Processing page 1 ======")

        headers = {
            "User-Agent": fake_useragent(),
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "name": "xxcx_cpzlbljlcxfw",
            "sql": "",
            "title": "产品质量监督抽查不良记录查询",
            "orderby": "",
            "startpage": "1",
            "pageSize": "500",
            "mode": "hdjl",
            "refresh": "true",
            "paging": "true",
            "align": "center",
            "queryed": "true",
            "searched": "true",
            "columnSetting":
            '[{"code":"cpflmc","show":false},{"code":"cpmc","display":"产品名称","show":true,"width":"250","reminder":""},{"code":"qymc","display":"企业名称","width":"200","reminder":""},{"code":"cjsj","display":"抽检时间","width":"70","reminder":""}]',
            "searchSetting":
            '[{"columnname":"cpmc","label":"产品名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"1"},{"columnname":"qymc","label":"企业名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"2"},{"columnname":"cjsj","label":"抽检时间","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"3"},{"columnname":"cpflmc","label":"产品分类","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"4"}]',
            "functionSetting": '[]'
        }
        browser = requests.post(self.url,
                                headers=headers,
                                data=urlencode(data),
                                timeout=60)

        if browser.status_code == 200:
            result = json.loads(browser.text)
            html = lxml.html.fromstring(result["data"])
            trs = html.xpath('//div[@class="mem_tbls"]/table/tr')
            print(len(trs))
            for n in range(1, len(trs)):
                print("------ processing no {} ------".format(n))
                tds = trs[n].xpath('.//td')
                data = {
                    "name": tds[0].text_content().strip(),
                    "enterprise": tds[1].text_content().strip(),
                    "time": tds[2].text_content().strip(),
                }
                self.save2db(data)
        else:
            print("Error when crawling page")
Example No. 21
    def crawl2(self, url):
        print("processing url: {}".format(url))
        headers = {
            "User-Agent": fake_useragent()
        }
        browser = requests.get(url, headers=headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            data = {
                "name": html.xpath('//span[@id="lbUnitName"]')[0].text_content().strip(),
                "addr": html.xpath('//span[@id="lbAddress"]')[0].text_content().strip(),
                "level": html.xpath('//span[@id="lbAptGrade"]')[0].text_content().strip(),
                "certificate_no": html.xpath('//span[@id="lbcertificateNo"]')[0].text_content().strip(),
            }
            self.save2db(data)
        else:
            print("Error while crawling page {}".format(url))
Example No. 22
    def prepare2crawl(self):
        url = "http://ty.cd168.cn/Json/getClass/"

        data = {
            "categoryid": 8
        }
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.post(url, headers=self.headers, data=urlencode(data))
        if browser.status_code == 200:
            tmp = json.loads(browser.text)
            for item in tmp:
                name = item["ItemName"].strip()
                self.cls[item["ID"]] = re.sub(r'\s+', '', name)

            self.crawl()
        else:
            print("Error getting class")
Example No. 23
    def crawl(self):
        print("crawling page 1")
        headers = {
            "User-Agent": fake_useragent()
        }
        browser = requests.get(self.URL, headers=headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)

            view_state_div = html.xpath('//input[@id="__VIEWSTATE"]')
            self.__VIEWSTATE = view_state_div[0].attrib["value"]
            self.__EVENTTARGET = "pagerExhibit"

            self.crawl2()

        else:
            print("Error while crawling page 1")
Example No. 24
    def crawl(self, url):
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.get(url, headers=self.headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)

            title = html.xpath('//span[@id="lblBT"]')[0].text_content().strip()
            m = re.search(r'(\d+)年(\d+)月(\d+)日', title)
            year = m.group(1)
            month = m.group(2)
            day = m.group(3)

            month = "0" + month if len(month) == 1 else month
            day = "0" + day if len(day) == 1 else day

            date = year + month + day

            src = html.xpath('//iframe[@id="wzzwInfo"]')[0].attrib["src"]
            url = self.base_url + src.replace('..', '')

            print("{0}: {1}".format(date, url))
            browser = requests.get(url, headers=self.headers)
            if browser.encoding != "utf-8":
                browser.encoding = "gb2312"

            html = lxml.html.fromstring(browser.text)
            trs = html.xpath('//tr')
            for n in range(1, len(trs)):
                tds = trs[n].xpath('.//td')
                if len(tds):
                    data = {
                        "no":
                        re.sub(r'\r|\n', '', tds[1].text_content().strip()),
                        "addr":
                        re.sub(r'\r|\n', '', tds[2].text_content().strip()),
                        "area":
                        re.sub(r'\r|\n', '', tds[3].text_content().strip()),
                        "price":
                        re.sub(r'\r|\n', '', tds[4].text_content().strip()),
                        "winner":
                        re.sub(r'\r|\n', '', tds[5].text_content().strip()),
                        "date":
                        date,
                    }
                    print(data)
                    self.save2db(data)
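The manual zero padding of month and day above can be written more compactly with str.zfill, which yields the same YYYYMMDD string:

date = "{}{}{}".format(year, month.zfill(2), day.zfill(2))  # e.g. year="2016", month="8", day="4" -> "20160804"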
Example No. 25
    def crawl(self):
        print("====== Processing page 1 ======")

        headers = {
            "User-Agent": fake_useragent(),
            "Content-Type": "application/x-www-form-urlencoded",
        }
        data = {
            "name": "xxcx_scmpcp",
            "sql": "",
            "title": "四川名牌产品",
            "orderby": "",
            "startpage": "1",
            "pageSize": "1000",
            "mode": "hdjl",
            "refresh": "true",
            "paging": "true",
            "align": "center",
            "queryed": "true",
            "searched": "true",
            "columnSetting": '[{"code":"cpmc","display":"产品名称","show":true,"width":"250","reminder":""},{"code":"qymc","display":"企业名称","width":"200","reminder":""},{"code":"zsbh","display":"证书编号","width":"70","reminder":""},{"code":"xzqh","display":"行政区划","width":"100","reminder":""}]',
            "searchSetting": '[{"columnname":"cpmc","label":"产品名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"1"},{"columnname":"qymc","label":"企业名称","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"2"},{"columnname":"zsbh","label":"证书编号","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"3"},{"columnname":"xzqh","label":"行政区划","labelWidth":"100","columnSpan":"","inputWidth":"185","inputSpan":"","compare":"like","columnvalue":"","rownum":"1","colnum":"4"}]',
            "functionSetting": '[]'
        }
        browser = requests.post(self.url, headers=headers, data=urlencode(data), timeout=60)

        if browser.status_code == 200:
            result = json.loads(browser.text)
            html = lxml.html.fromstring(result["data"])
            trs = html.xpath('//div[@class="mem_tbls"]/table/tr')
            print(len(trs))
            for n in range(1, len(trs)):
                print("------ processing no {} ------".format(n))
                tds = trs[n].xpath('.//td')
                data = {
                    "name": tds[0].text_content().strip(),
                    "enterprise": tds[1].text_content().strip(),
                    "certificate_no": tds[2].text_content().strip(),
                    "area": tds[3].text_content().strip(),
                }
                self.save2db(data)

        else:
            print("Error when crawling page")
Example No. 26
    def crawl2(self, url):
        print("processing: {0}".format(url))
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.get(url, headers=self.headers)
        if browser.status_code == 200:
            # root = lxml.html.fromstring(browser.text)
            # lis = root.xpath('//ul[@class="wzsc_bgjd_cs_jbxx"]/li')

            root = BeautifulSoup(browser.text, 'lxml')
            lis = root.find('ul', class_="wzsc_bgjd_cs_jbxx").find_all("li")

            tp = lis[2].contents[1].strip('\r\n').replace('\t', '').replace(
                ' ', '').strip('\r\n').replace('\r\n', ',')
            link = lis[7].find('a')
            website = link.get("href") if link else "#"
            route = root.find_all(
                "div", class_="wzsc_bgjd_cs_p contoxt")[1].text.strip()

            data = {
                "name":
                str(lis[0].contents[1]) if len(lis[0].contents) > 1 else "",
                "license":
                str(lis[1].contents[1]) if len(lis[1].contents) > 1 else "",
                "type":
                tp,
                "addr":
                str(lis[3].contents[1]) if len(lis[3].contents) > 1 else "",
                "tel":
                str(lis[4].contents[1]) if len(lis[4].contents) > 1 else "",
                "zip":
                str(lis[5].contents[1]) if len(lis[5].contents) > 1 else "",
                "email":
                str(lis[6].contents[1]) if len(lis[6].contents) > 1 else "",
                "website":
                website,
                "route":
                route
            }
            self.save2db(data)

        else:
            print("Error when processing url: {0}".format(url))
        time.sleep(random.randint(1, 3))
Example No. 27
    def crawl2(self, url, msg):
        print("processing url: {}".format(url))
        headers = {
            "User-Agent": fake_useragent()
        }
        browser = requests.get(url, headers=headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            td = html.xpath('//*[@id="main_body"]/div[2]/table/tr/td/table/tr[2]/td/table/tr[5]/td')
            intro = str(td[0].text_content())

            data = {
                "name": msg[0],
                "type": msg[1],
                "intro": intro,
            }
            self.save2db(data)

        else:
            print("Error while crawling page {}".format(p))
Example No. 28
    def crawl(self):
        for n in range(1, 137):
            print("------ Crawling page {} ------".format(n))
            url = self.URL.format(n)
            print(url)
            headers = {
                "User-Agent": fake_useragent()
            }
            browser = requests.get(url, headers=headers)

            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)
                links = html.xpath('//div[@class="search_Newslistinn"]/ul/li/div/a')
                for link in links:
                    tmp_url = self.BASE_URL + link.attrib["href"]
                    self.crawl2(tmp_url)
            else:
                print("Error while crawling page {}".format(n))

            time.sleep(random.randint(1, 3))
Example No. 29
    def crawl2(self, url, msg):
        print("processing url: {}".format(url))
        headers = {"User-Agent": fake_useragent()}
        browser = requests.get(url, headers=headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)
            td = html.xpath(
                '//*[@id="main_body"]/div[2]/table/tr/td/table/tr[2]/td/table/tr[5]/td'
            )
            intro = str(td[0].text_content())

            data = {
                "name": msg[0],
                "type": msg[1],
                "intro": intro,
            }
            self.save2db(data)

        else:
            print("Error while crawling page {}".format(p))
Example No. 30
    def prepare2crawl(self):
        for p in range(1, 1435):
            print("------ Processing page {}".format(p))
            url = self.url_template.format(p)

            header = {"User-Agent": fake_useragent()}

            req = urllib.request.Request(url, headers=header, method="GET")
            resp = urllib.request.urlopen(req)

            if resp.status == 200:
                html = lxml.html.fromstring(resp.read().decode("utf-8"))

                divs = html.xpath('//div[@class="content_left_bg"]/div')
                if len(divs):
                    for n in range(2, len(divs)):
                        tds = divs[n].xpath('.//td')
                        href = tds[2].xpath('./a')[0].attrib["href"]
                        link = self.base_url.format(href)
                        data = {
                            "name":
                            re.sub('律师', '',
                                   tds[2].xpath('./a')[0].text_content()),
                            "license":
                            re.split(r'[:|:]', tds[5].text_content())[1],
                            "type":
                            re.split(r'[:|:]', tds[6].text_content())[1],
                            "office":
                            tds[7].text_content().strip(),
                            "expertise":
                            re.split(r'[:|:]', tds[8].text_content())[1],
                            "education":
                            re.split(r'[:|:]', tds[9].text_content())[1],
                            "area":
                            tds[4].text_content().strip(),
                            "link":
                            link,
                        }
                        self.save2db(data)
            else:
                print("Error processing page {}".format(p))
Example No. 31
    def prepare2crawl(self):
        for p in range(10, 13):
            print("----------- Crawling page {} -----------".format(p))

            data = {
                '__VIEWSTATE': '/wEPDwULLTExNzkxNTY4MjEPZBYCAgMPZBYEAgEPFgIeC18hSXRlbUNvdW50Ag8WHmYPZBYCZg8VAwoyMDE2LzA4LzE5BjMwNjU1NTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA45pyIMTnml6UpZAIBD2QWAmYPFQMKMjAxNi8wOC8wOQYyOTEyODAx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwOOaciDA55pelKWQCAg9kFgJmDxUDCjIwMTYvMDgvMDQGMjkwNDQ5MeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDjmnIgwNOaXpSlkAgMPZBYCZg8VAwoyMDE2LzA3LzI3BjI4MjkyMDHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA35pyIMjfml6UpZAIED2QWAmYPFQMKMjAxNi8wNy8yMQYyNzY3OTAx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwN+aciDIx5pelKWQCBQ9kFgJmDxUDCjIwMTYvMDcvMTMGMjcwMjMwMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDfmnIgxM+aXpSlkAgYPZBYCZg8VAwoyMDE2LzA3LzA2BjI2NjY1MzHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA35pyIMDbml6UpZAIHD2QWAmYPFQMKMjAxNi8wNi8yOQYyNTkxMTMx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNuaciDI55pelKWQCCA9kFgJmDxUDCjIwMTYvMDYvMjEGMjUyNzEzMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDbmnIgyMeaXpSlkAgkPZBYCZg8VAwoyMDE2LzA2LzE2BjI0ODE5OTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA25pyIMTbml6UpZAIKD2QWAmYPFQMKMjAxNi8wNi8xNAYyNDM1MjEx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNuaciDE05pelKWQCCw9kFgJmDxUDCjIwMTYvMDYvMDEGMjM2MTQxMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDbmnIgwMeaXpSlkAgwPZBYCZg8VAwoyMDE2LzA1LzMxBjIzNTM0NTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA15pyIMzHml6UpZAIND2QWAmYPFQMKMjAxNi8wNS8yNQYyMzM2MjUx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNeaciDI15pelKWQCDg9kFgJmDxUDCjIwMTYvMDUvMTkGMjMwNTM1MeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDXmnIgxOeaXpSlkAgMPDxYEHgtSZWNvcmRjb3VudAKpAR4QQ3VycmVudFBhZ2VJbmRleAIBZGRknBpMzCh1lVAb0hQ+KYqaC3XY/ObAUBQzQyX+ubYfCdU=',
                '__EVENTTARGET': 'Pager',
                '__EVENTARGUMENT': p,
            }
            self.headers["User-Agent"] = fake_useragent()
            browser = requests.post(self.post_url, headers=self.headers, data=urlencode(data))
            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)
                lis = html.xpath('//div[@class="list1"]/li')
                for li in lis:
                    href = li.xpath('./a')[0].attrib["href"]
                    url = href.replace('..', '')

                    self.crawl(self.base_url + url)
            else:
                print("Error crawling page {}".format(p))
Example No. 32
    def crawl(self, url):
        self.headers["User-Agent"] = fake_useragent()
        browser = requests.get(url, headers=self.headers)
        if browser.status_code == 200:
            html = lxml.html.fromstring(browser.text)

            title = html.xpath('//span[@id="lblBT"]')[0].text_content().strip()
            m = re.search(r'(\d+)年(\d+)月(\d+)日', title)
            year = m.group(1)
            month = m.group(2)
            day = m.group(3)

            month = "0" + month if len(month) == 1 else month
            day = "0" + day if len(day) == 1 else day

            date = year + month + day

            src = html.xpath('//iframe[@id="wzzwInfo"]')[0].attrib["src"]
            url = self.base_url + src.replace('..', '')

            print("{0}: {1}".format(date, url))
            browser = requests.get(url, headers=self.headers)
            if browser.encoding != "utf-8":
                browser.encoding = "gb2312"

            html = lxml.html.fromstring(browser.text)
            trs = html.xpath('//tr')
            for n in range(1, len(trs)):
                tds = trs[n].xpath('.//td')
                if len(tds):
                    data = {
                        "no": re.sub(r'\r|\n', '', tds[1].text_content().strip()),
                        "addr": re.sub(r'\r|\n', '', tds[2].text_content().strip()),
                        "area": re.sub(r'\r|\n', '', tds[3].text_content().strip()),
                        "price": re.sub(r'\r|\n', '', tds[4].text_content().strip()),
                        "winner": re.sub(r'\r|\n', '', tds[5].text_content().strip()),
                        "date": date,
                    }
                    print(data)
                    self.save2db(data)
Example No. 33
    def crawl(self):
        for m in range(1, 9500):
            print("====== Processing page {0} ======".format(m))

            headers = {
                "User-Agent": fake_useragent()
            }
            browser = requests.get(self.url.format(m), headers=headers)

            if browser.status_code == 200:
                root = lxml.html.fromstring(browser.text)
                divs = root.xpath('//div[@class="lawyerinfo"]')
                data = {
                    "name": divs[0].xpath("./h3")[0].text_content().strip(),
                    "law_office": divs[0].xpath('./p[1]')[0].text_content().split(':')[1],
                    "gender": divs[0].xpath('./p[2]')[0].text_content().split(':')[1],
                    "license": divs[0].xpath('./p[3]')[0].text_content().split(':')[1],
                    "category": divs[0].xpath('./p[4]')[0].text_content().split(':')[1],
                }
                self.save2db(data)
            else:
                print("Error when crawling page {0}".format(m))

            time.sleep(random.randint(1, 2))
Example No. 34
    def prepare2crawl(self):
        for p in range(10, 13):
            print("----------- Crawling page {} -----------".format(p))

            data = {
                '__VIEWSTATE':
                '/wEPDwULLTExNzkxNTY4MjEPZBYCAgMPZBYEAgEPFgIeC18hSXRlbUNvdW50Ag8WHmYPZBYCZg8VAwoyMDE2LzA4LzE5BjMwNjU1NTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA45pyIMTnml6UpZAIBD2QWAmYPFQMKMjAxNi8wOC8wOQYyOTEyODAx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwOOaciDA55pelKWQCAg9kFgJmDxUDCjIwMTYvMDgvMDQGMjkwNDQ5MeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDjmnIgwNOaXpSlkAgMPZBYCZg8VAwoyMDE2LzA3LzI3BjI4MjkyMDHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA35pyIMjfml6UpZAIED2QWAmYPFQMKMjAxNi8wNy8yMQYyNzY3OTAx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwN+aciDIx5pelKWQCBQ9kFgJmDxUDCjIwMTYvMDcvMTMGMjcwMjMwMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDfmnIgxM+aXpSlkAgYPZBYCZg8VAwoyMDE2LzA3LzA2BjI2NjY1MzHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA35pyIMDbml6UpZAIHD2QWAmYPFQMKMjAxNi8wNi8yOQYyNTkxMTMx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNuaciDI55pelKWQCCA9kFgJmDxUDCjIwMTYvMDYvMjEGMjUyNzEzMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDbmnIgyMeaXpSlkAgkPZBYCZg8VAwoyMDE2LzA2LzE2BjI0ODE5OTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA25pyIMTbml6UpZAIKD2QWAmYPFQMKMjAxNi8wNi8xNAYyNDM1MjEx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNuaciDE05pelKWQCCw9kFgJmDxUDCjIwMTYvMDYvMDEGMjM2MTQxMeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDbmnIgwMeaXpSlkAgwPZBYCZg8VAwoyMDE2LzA1LzMxBjIzNTM0NTHmi43ljZbkvJrmiJDkuqTnu5PmnpzkuIDop4jooagoMjAxNuW5tDA15pyIMzHml6UpZAIND2QWAmYPFQMKMjAxNi8wNS8yNQYyMzM2MjUx5ouN5Y2W5Lya5oiQ5Lqk57uT5p6c5LiA6KeI6KGoKDIwMTblubQwNeaciDI15pelKWQCDg9kFgJmDxUDCjIwMTYvMDUvMTkGMjMwNTM1MeaLjeWNluS8muaIkOS6pOe7k+aenOS4gOiniOihqCgyMDE25bm0MDXmnIgxOeaXpSlkAgMPDxYEHgtSZWNvcmRjb3VudAKpAR4QQ3VycmVudFBhZ2VJbmRleAIBZGRknBpMzCh1lVAb0hQ+KYqaC3XY/ObAUBQzQyX+ubYfCdU=',
                '__EVENTTARGET': 'Pager',
                '__EVENTARGUMENT': p,
            }
            self.headers["User-Agent"] = fake_useragent()
            browser = requests.post(self.post_url,
                                    headers=self.headers,
                                    data=urlencode(data))
            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)
                lis = html.xpath('//div[@class="list1"]/li')
                for li in lis:
                    href = li.xpath('./a')[0].attrib["href"]
                    url = href.replace('..', '')

                    self.crawl(self.base_url + url)
            else:
                print("Error crawling page {}".format(p))
Example No. 35
    def crawl(self):
        for p in range(2, 39):
            print("====== Processing page {0} ======".format(p))

            headers = {
                "User-Agent": fake_useragent()
            }
            browser = requests.get(self.url.format(p), headers=headers)

            if browser.status_code == 200:
                root = lxml.html.fromstring(browser.text)
                lis = root.xpath('//div[@class="right_message"]/div[@class="right_hy"]/ul/li')
                for li in lis:
                    data = {
                        "name": li.xpath('.//div[@class="right_hy_into_name"]')[0].text_content().strip(),
                        "addr": li.xpath('.//div[@class="right_hy_into_zydz"]/span[1]')[0].text_content().strip(),
                        "tel": li.xpath('.//div[@class="right_hy_into_zydz"]/span[2]')[0].text_content().strip(),
                        "operation": li.xpath('.//div[@class="right_hy_into_zydz"]/span[3]')[0].text_content().strip(),
                    }
                    self.save2db(data)
            else:
                print("Error when crawling page {0}".format(p))

            time.sleep(random.randint(1, 2))
Example No. 36
    def __init__(self):
        # Init mysql
        self.conn = mysql.connector.connect(**self.config)
        self.cursor = self.conn.cursor()

        self.header = {"User-Agent": fake_useragent()}
Example No. 37
    def crawl(self, start, end):
        for p in range(start, end + 1):
            headers = {
                "User-Agent": fake_useragent()
            }
            browser = requests.get(self.url_template.format(p), headers=headers)
            browser.encoding = "utf-8"
            if browser.status_code == 200:
                html = lxml.html.fromstring(browser.text)
                lis = html.xpath('//ul[@class="tabsContainer_ul"]/li')

                # Universities of current page
                for li in lis:
                    # University url like:
                    # http://kaoshi.edu.sina.com.cn/college/c/10001.shtml
                    url = li.xpath('./a')[0].attrib["href"]

                    # Extract university id from url
                    match = re.findall(r'/(\d+)\.shtml', url)
                    uid = int(match[0]) if match else 0

                    divs = li.xpath('.//div[@class="clearfix"]')

                    # Extract name, weibo
                    links = divs[0].xpath('.//a')
                    name = links[0].text_content().strip()
                    num = len(links)
                    weibo_official = "-"
                    weibo_enrollment_office = "-"
                    for n in range(1, num):
                        text = links[n].text_content().strip()
                        if text == "官方微博":
                            weibo_official = links[n].attrib["href"]

                        if text == "招办微博":
                            weibo_enrollment_office = links[n].attrib["href"]

                    # Extract info
                    ps = divs[1].xpath('.//p')
                    location = ps[0].text_content().strip().split(':')[1].strip()
                    utype = ps[2].text_content().strip().split(':')[1].strip()
                    subject_to = ps[4].text_content().strip().split(':')[1].strip()

                    tmp = re.findall(r'(\d+)', ps[1].text_content().strip())

                    key_discipline = "-" if len(tmp) == 0 else tmp[0]

                    tmp = re.findall(r'(\d+)', ps[3].text_content().strip())
                    master = "-" if len(tmp) == 0 else tmp[0]

                    tmp = re.findall(r'(\d+)', ps[5].text_content().strip())
                    doctor = "-" if len(tmp) == 0 else tmp[0]

                    # Extract tags
                    tags = []
                    spans = divs[2].xpath('.//span')
                    for span in spans:
                        tags.append(span.text_content().strip())

                    doc = {
                        "uid": uid,  # school id
                        "name": name,  # school name
                        "weibo_official": weibo_official,  # official Weibo account
                        "weibo_enrollment_office": weibo_enrollment_office,  # admissions office Weibo
                        "location": location,  # location
                        "utype": utype,  # school type
                        "subject_to": subject_to,  # affiliation
                        "key_discipline": key_discipline,  # number of key disciplines
                        "master": master,  # number of master's programs
                        "doctor": doctor,  # number of doctoral programs
                        "tags": tags,  # tags
                        "url": url,  # url for the second-pass crawl
                    }
                    self.collection.insert_one(doc)

                print("Page{:>6}: [done]".format(p))
                print("Page{:>6}: [done]".format(p), file=self.log)
            else:
                print("Page{:>6}: [fail]".format(p))
                print("Page{:>6}: [fail]".format(p), file=self.log)

        # Close file handler
        self.log.close()
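Example No. 37 stores documents through self.collection.insert_one and writes progress lines to self.log, neither of which appears in this listing. A hedged sketch of the setup that method seems to assume, using pymongo; the class, database, collection and log-file names are guesses:

import pymongo

class UniversityCrawler:  # hypothetical class name
    def __init__(self):
        # Hypothetical MongoDB connection; database and collection names are guesses.
        client = pymongo.MongoClient("mongodb://localhost:27017/")
        self.collection = client["edu"]["universities"]

        # Progress log that crawl() prints its "[done]"/"[fail]" lines into.
        self.log = open("crawl.log", "a", encoding="utf-8")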