        for li_tag in div.find_all("li"):
            a_tag = li_tag.find("a")["href"]
            url_n = url + a_tag[1:]
            date = li_tag.find("span").text
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'utf-8'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        # drop inline scripts and styles before extracting text
        [s.extract() for s in soup('script')]
        [s.extract() for s in soup('style')]
        title = soup.find("h1").get_text().strip().replace("\n", "")
        date_div = soup.find("div", class_="detail_bz")
        date = date_div.find("span").get_text().strip().replace("\n", "")[6:]
        text = soup.find("div", class_="detail_con").get_text().strip().replace("\n", "")
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_21")
    j.submit("first", "second", "thrid", pyname="cnpiec_21")
    # j.clear_schedule()
        return line


def test():
    url = "http://soeasycenter.com/newTender"
    parm = {
        "periodTime": " 0.0",
        "pageNum": "1",
        "pageSize": "500",
    }
    data = requests.post(url, data=parm)
    data.encoding = "utf-8"
    data = data.text
    soup = BeautifulSoup(data, "html.parser")
    table = soup.find("table", class_="table table-striped")
    # remove the header row so the loop only sees data rows
    [s.extract() for s in table('thead')]
    for tr_tag in table.find_all("tr"):
        a_tag = tr_tag.find("a")
        url_n = "http://soeasycenter.com" + a_tag["href"]
        date = tr_tag.find_all("td")[3].text
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_31")
    j.submit("first", "thrid", pyname="cnpiec_31")
        table_tag = soup.find("table", id="tblInfo")
        td_tag = table_tag.find("td", id="tdTitle")
        t_font = td_tag.find("font", style="font-size: 25px")
        d_font = td_tag.find("font", class_="webfont")
        title = t_font.text.strip()
        d_line = d_font.text.strip()
        end = d_line.find("】")
        date = d_line[6:end].strip().replace("/", "-")
        # slice the raw HTML between the content table and </table></body>,
        # then keep only the text that sits between tags
        start = data.find(
            '<table cellspacing="0" cellpadding="0" border="0" style="border-width:0px;width:748px;border-collapse:collapse;">'
        )
        end = data.find('</table></body>')
        text = data[start:end]
        p = re.compile(r'(?<=>).*?(?=<)')  # raw string: '\>' is an invalid escape
        result = p.findall(text)
        text = "".join(result).replace(" ", "")
        if text == "":
            # fall back to the infodetail block when the slice came up empty
            div = table_tag.find("div", class_="infodetail")
            text = "".join(div.text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_40")
    j.submit("first", "second", "thrid", pyname="cnpiec_40")
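# The raw-HTML slice above (string find() offsets plus the regex
# r'(?<=>).*?(?=<)') breaks as soon as the site tweaks its inline styles.
# A sketch of a more resilient alternative, assuming the content stays in
# the same tblInfo/infodetail nodes (extract_text is a hypothetical helper,
# not part of the ss framework):
import requests
from bs4 import BeautifulSoup

def extract_text(url):
    resq = requests.get(url)
    resq.encoding = "UTF-8"
    soup = BeautifulSoup(resq.text, "html.parser")
    # prefer the dedicated content div, fall back to the whole info table
    node = soup.find("div", class_="infodetail") or soup.find("table", id="tblInfo")
    # get_text() concatenates the text nodes of the subtree, which is what
    # the lookbehind/lookahead regex was approximating on raw markup
    return "".join(node.get_text().split()) if node else ""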
        [s.extract() for s in div_tag('style')]
        title = div_tag.find_all("h1")[0].get_text().strip().replace("\n", "")
        date = div_tag.find_all("span", class_="Blue")[-2].get_text().strip().replace("\n", "")
        text = ""
        # The body is loaded by an inline jQuery $.get() call; find that script,
        # pull the /webfile...htm URL out of it, and fetch the fragment directly.
        for s in soup.find_all("script"):
            script_text = s.get_text()  # was assigned to `str`, shadowing the builtin
            if re.search(r"jQuery\(document\)\.ready\(function", script_text):
                span = re.search(r'\$\.get\("/webfile.*\.htm"', script_text).span()
                # skip the leading '$.get("' (7 chars) and the trailing quote
                t_url = "http://www.hngp.gov.cn" + script_text[span[0] + 7:span[1] - 1]
                t_data = requests.get(t_url)
                t_data.encoding = 'utf-8'
                t_data = t_data.text
                t_soup = BeautifulSoup(t_data, "html.parser")
                [st.extract() for st in t_soup('style')]
                text = "".join(t_soup.get_text().strip().split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_17")
    j.submit("first", "second", "thrid", pyname="cnpiec_17")
class thrid(ss.ThreadingSpider):
    def get(self, url):
        resq = requests.get(url)
        resq.encoding = "UTF-8"
        data = resq.text
        soup = BeautifulSoup(data, "html.parser")
        div_tag = soup.find("div", class_="W980 Center PaddingTop10")
        title = div_tag.find("h1").text.strip()
        div_tag2 = div_tag.find("div", class_="Padding10 TxtCenter Gray").text.strip()
        # the date sits between the "发布时间:" (publish time) and
        # "浏览次数:" (view count) labels
        s_num = div_tag2.find("发布时间:")
        e_num = div_tag2.find("浏览次数:")
        dt = div_tag2[s_num + 5:e_num].strip()
        date = dt.split(" ")[0]
        # slice the raw HTML of the content div and keep the text between tags
        start = data.find('<div class="Contnet" style="min-height:500px; padding:0 30px;">')
        end = data.find('<ul style="text-align:center; padding:10px;">')
        text = data[start:end]
        p = re.compile(r'(?<=>).*?(?=<)')  # raw string: '\>' is an invalid escape
        result = p.findall(text)
        text = "".join(result)
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_47")
    j.submit("first", "second", "thrid", pyname="cnpiec_47")
    url = "http://new.zmctc.com/zjgcjy/InfoDetail/?InfoID=9329daf9-0310-4ded-bee7-3dd6fca0ae35&CategoryNum=004001001"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
    }
    resq = requests.get(url, headers=header)
    resq.encoding = "UTF-8"
    data = resq.text
    soup = BeautifulSoup(data, "html.parser")
    # [s.extract() for s in soup("style")]
    # print(soup.text)
    table_tag = soup.find("table", id="tblInfo")
    td_tag = table_tag.find("td", id="tdTitle")
    t_font = td_tag.find("font", style="font-size: 25px")
    d_font = td_tag.find("font", class_="webfont")
    title = t_font.text.strip()
    d_line = d_font.text.strip()
    end = d_line.find("】")
    date = d_line[6:end].strip().replace("/", "-")
    text = "".join(table_tag.text.split())
    line = url + "##" + date + "##" + title + "##" + text + "\n"
    return line


if __name__ == '__main__':
    j = job.Job("cnpiec_45")
    j.submit("first", "second", "thrid", pyname="cnpiec_45")
                continue
            url_n = "http://zbxx.ycit.cn" + a["href"]
            print(url_n, date.text.strip())
        return urls


def test2():
    url = "http://zbxx.ycit.cn/zbxx/ShowArticle.asp?ArticleID=768"
    resq = requests.get(url)
    resq.encoding = "gbk"
    data = resq.text
    soup = BeautifulSoup(data, "html.parser")
    table_tag = soup.find("table", width="1004", height="462")
    td = table_tag.find("td", width="1000")
    table = td.find("table")
    title = table.find("td", class_="wzrr").text.strip()
    d_td = table.find("tr", align="middle").text.strip()
    # the date follows the "更新时间:" (last updated) label
    start = d_td.find("更新时间:")
    date = d_td[start + 5:]
    text = "".join(table_tag.text.split())
    line = url + "##" + date + "##" + title + "##" + text + "\n"
    return line


if __name__ == '__main__':
    j = job.Job("cnpiec_41")
    j.submit("first", "second", "thrid", pyname="cnpiec_41")
class thrid(ss.ThreadingSpider):
    def get(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/70.0.3538.102 Safari/537.36",
            "Connection": "keep-alive",  # was "keep - alive", an invalid header value
            "Cookie": '__jsluid=fda97a093bd3c210c560f9ab4ecb80dd; reg_referer="aHR0cDovL3d3dy5iaWRjaGFuY2UuY29tLw=="; Hm_lvt_2751005a6080efb2d39109edfa376c63=1546582829; bdshare_firstime=1546582832885; Cookies_Userid=42k6u0p1egikh7r0p2b32ujavq0nu79; JSESSIONID=B6FEB4E331F0946C6D62E44BE4855196; Cookies_Key=-3k1utnkf0g7gt5lte5pf4tu1um04u2el56kbnh90lmlnl35fd4ti94uqg7bcbrci; Cookies_token=0dd33210-eaf5-4e53-bcd5-c0a4c5a80b5b; Hm_lpvt_2751005a6080efb2d39109edfa376c63=1546590789'
        }
        time.sleep(5)  # throttle between requests
        resq = requests.get(url, headers=header)
        soup = BeautifulSoup(resq.text, "html.parser")
        title = soup.find("div", class_="xlh").text.strip()
        div_tag = soup.find("div", class_="xllabel-l")
        date = div_tag.find("span", id="infopubdate").text.strip()
        text = soup.find("div", class_="xlbodybox").text
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    # test()
    j = job.Job("cnpiec_48")
    j.set_speed()
    j.submit("first", "second", "thrid", pyname="cnpiec_48")
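# The hard-coded Cookie header above (JSESSIONID and friends) will expire.
# A sketch of one way around that, assuming the site hands out its cookies
# on a plain GET of the front page (unverified; the seed URL is inferred
# from the base64 reg_referer value in the cookie string):
import requests

def make_session(seed_url="http://www.bidchance.com/"):
    s = requests.Session()
    s.headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                               " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36")
    s.get(seed_url)  # any Set-Cookie responses land in the session's cookie jar
    return s  # reuse this session for the detail pages instead of a static header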
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'cz.fjzfcg.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ua
        },
        cookies=cookies)
    print(data.status_code)
    data.encoding = 'utf-8'
    data = data.text
    print(data)
    soup = BeautifulSoup(data, "html.parser")
    div_tag = soup.find("div", class_="wrapTable")
    tbody = div_tag.find("tbody")
    for tr in tbody.find_all("tr"):
        a_tag = tr.find("a")
        url_n = "http://cz.fjzfcg.gov.cn" + a_tag["href"]
        date = tr.find_all("td")[1].text
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_5")
    j.submit("first", "second", "thrid", pyname="cnpiec_5")
    # j.clear_schedule()
            date = tr_tag.find_all("td")[2].text[1:-1]  # drop the enclosing characters around the date
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'gb2312'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        table_tag = soup.find_all("table", width="887")[0]
        title = table_tag.find_all("td", height="76")[0].get_text().strip().replace("\n", "")
        date = table_tag.find_all("td", height="30")[0].get_text().strip().replace("\n", "")[10:19]
        text = table_tag.find_all("td", style="padding:26px 40px 10px;")[0].get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_16")
    j.submit("first", "second", "thrid", pyname="cnpiec_16")
    def get(self, url):
        pass


class thrid(ss.ThreadingSpider):
    def get(self, url):
        resq = requests.get(url)
        resq.encoding = "UTF-8"
        data = resq.text
        jsons = json.loads(data)  # the detail endpoint returns JSON, not HTML
        title = jsons["noticeTitle"].replace("\n", "")
        n_date = jsons["noticePubDate"]
        date = n_date.split(" ")[0]  # keep the date, drop the time of day
        content = jsons["noticeContent"]
        soup = BeautifulSoup(content, "html.parser")
        [s.extract() for s in soup("style")]
        text = "".join(soup.text.split())
        result = self.url_increment.is_increment(url, date)
        if result:
            line = url + "##" + date + "##" + title + "##" + text + "\n"
            return line
        else:
            return self.attr.DONE


if __name__ == '__main__':
    j = job.Job("cnpiec_46")
    j.submit("first", "thrid", pyname="cnpiec_46")
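# For reference: ss.ThreadingSpider's url_increment is not shown in these
# fragments. A minimal hypothetical sketch of the contract the spiders rely
# on -- is_increment() returns True exactly once per (url, date) pair, so a
# re-crawled notice yields self.attr.DONE instead of a duplicate line:
class UrlIncrement:
    def __init__(self):
        self.seen = set()  # the real framework presumably persists this

    def is_increment(self, url, date):
        key = (url, date)
        if key in self.seen:
            return False  # already crawled
        self.seen.add(key)
        return True  # new item, emit it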
            return self.attr.DONE
        soup = BeautifulSoup(data, "html.parser")
        dl_tag = soup.find("dl", class_="llist")
        for dd_tag in dl_tag.find_all("dd", cid="4"):
            url_n = dd_tag.find("a")["href"]
            date = dd_tag.find("span").text
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'utf-8'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        tag = soup.find("div", class_="lright cright")
        ctitle = tag.find("div", class_="ctitle")
        text = tag.find_all(attrs={'class': 'ccontent'})[0].get_text().strip()
        title = tag.find("h1").get_text()
        date = ctitle.find("i").get_text().strip()[6:]  # drop the 6-character label before the date
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_25")
    j.submit("first", "second", "thrid", pyname="cnpiec_25")
        text = text.get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


def test():
    url = "http://www.njgp.gov.cn/cgxx/cggg/jzcgjg/index.html"
    data = requests.get(url)
    data.encoding = 'utf-8'
    data = data.text
    # list-page links are relative to the directory holding index.html
    nums = re.search("index", url).span()
    prefix = url[:nums[0]]
    soup = BeautifulSoup(data, "html.parser")
    div_tag = soup.find("div", class_="R_cont_detail")
    for li_tag in div_tag.find_all("li"):
        a_tag = li_tag.find("a")
        url_t = a_tag["href"]
        url_n = prefix + url_t[2:]  # strip the leading './' from the relative href
        [s.extract() for s in li_tag("a")]  # remove the <a> so only the date text is left
        date = li_tag.text.strip()
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_26")
    j.submit("first", "second", "thrid", pyname="cnpiec_26")
            date = li_tag.find("span").text
            print(url_n, date)
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'utf-8'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        div = soup.find_all("div", class_="article-info")[0]
        [s.extract() for s in div('script')]
        [s.extract() for s in div('style')]
        title = div.find_all("h1")[0].get_text().strip().replace("\n", "")
        date = div.find_all("p", class_="infotime")[0].get_text().strip().replace("\n", "")
        text = div.find_all("div")[0].get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_23")
    j.submit("first", "second", "thrid", pyname="cnpiec_23")
        text = tag.get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


def test():
    url = "http://www.ccgp-qinghai.gov.cn/jilin/zbxxController.form?declarationType=&type=1&pageNo=1"
    data = requests.get(url)
    data.encoding = 'utf-8'
    data = data.text
    soup = BeautifulSoup(data, "html.parser")
    div_tag = soup.find("div", class_="m_list_3")
    for li_tag in div_tag.find_all("li"):
        a_tag = li_tag.find("a")
        url_n = a_tag["href"]
        date = li_tag.find("span").text
        # convert the Chinese date, e.g. 2019年1月4日 -> 2019-1-4
        date = date.replace("年", "-").replace("月", "-").replace("日", "")
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_30")
    j.submit("first", "second", "thrid", pyname="cnpiec_30")
    # j.clear_schedule()
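# The chained replace("年"/"月"/"日") above silently passes malformed strings
# through. A stricter sketch using the standard library (an alternative, not
# something the job framework requires):
from datetime import datetime

def normalize_cn_date(raw):
    # "2019年1月4日" -> "2019-01-04"; raises ValueError on anything else
    return datetime.strptime(raw.strip(), "%Y年%m月%d日").strftime("%Y-%m-%d")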