Example #1
    def parse(self, response):
        '''
        Extract each board's URL and pass it to parse_pages.

        :param response:
        :return:
        '''
        self.log('Hi, this is an item page! %s' % response.url)
        # root = bs(response.body)
        #
        # #use navigation
        # curls = []
        # divs = root.findAll("div", attrs={"class": "title"})
        # for div in divs:
        #     try:
        #         url = div.find("span").find("a", attrs={"href": re.compile("^NewsList")})
        #         curls.append(url.get("href"))
        #     except:
        #         pass
        #
        # for url in curls:
        #     yield scrapy.Request(root_domain + "/News/" + url, self.parse_pages)
        # test code: build a nested item with two child articles and dump it as JSON
        item = FenghuoItem()
        item['articles'] = []
        c_item = FenghuoItem()
        c_item['url'] = '1233'
        b_item = FenghuoItem()
        b_item['url'] = '3333'
        item['articles'].append(dict(c_item))
        item['articles'].append(dict(b_item))
        line = json.dumps(dict(item)) + '\n'
        print line
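
If the commented-out navigation block above were re-enabled, the method would match its docstring. Roughly (a sketch assuming root_domain, re, and the parse_pages callback exist elsewhere in the spider, as the commented code implies):

    def parse(self, response):
        # collect the NewsList links from each title block, then hand each
        # board url to parse_pages
        root = bs(response.body)
        curls = []
        for div in root.findAll("div", attrs={"class": "title"}):
            span = div.find("span")
            a = span.find("a", attrs={"href": re.compile("^NewsList")}) if span else None
            if a:
                curls.append(a.get("href"))
        for url in curls:
            yield scrapy.Request(root_domain + "/News/" + url, self.parse_pages)
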
Example #2
    def parse_item(self, response):
        '''
        Visit each news page and extract the item's fields.

        :param response:
        :return:
        '''
        self.log('Hi, this is an item page! %s' % response.url)
        item = FenghuoItem()
        root = bs(response.body)
        item['topPost'] = 1
        item["site_id"] = 13
        item['website_id'] = ''
        item["site_name"] = '通山县机构编制网'
        item["area"] = 958
        item["site_weight"] = 2
        item['countryid'] = 1156
        item['province'] = 1673
        item['city'] = 136
        item["ip"] = socket.gethostbyname("www.tsxbb.gov.cn")
        item["site_url"] = "www.tsxbb.gov.cn"
        item["forumurl"] = response.meta['forumurl']
        item["site_type"] = '新闻'
        item["url"] = response.url
        item["subname"] = root.find("span", attrs={
            "class": "text14h"
        }).find("a", attrs={
            "href": "../"
        }).text
        item["title"] = root.find("td", attrs={"class": "textbiaoti"}).text
        # the info line holds the pubdate (starting at '20..') and a 'su = ...;' site id;
        # renamed from `str`, which shadowed the builtin
        info = root.find("td", attrs={"class": "text12hui"}).text
        info = info[info.index('20'):]
        item["pubdate"] = info[:info.index(' ') - 1]
        try:
            info = info[info.index('su = ') + 6:]
            item["website_id"] = info[:info.index(';') - 1]
        except ValueError:
            item["website_id"] = ""
        styles = root.find("div", attrs={
            "class": "TRS_Editor"
        }).findAll("style")
        for style in styles:
            style.clear()
        # replace every image tag with its serialized markup so it survives text extraction
        imgs = root.find("div", attrs={"class": "TRS_Editor"}).findAll("img")
        for img in imgs:
            img.replaceWith(img.prettify())

        item["txt"] = root.find("div", attrs={
            "class": "TRS_Editor"
        }).text.replace("\r\n",
                        "$*huanhang*$").replace("\n", "$*huanhang*$").replace(
                            "\"", "'").replace("<br />", "$*huanhang*$")
        item["txt_len"] = len(item["txt"])
        item["domain_1"] = "tsxbb.gov.cn"
        item["domain_2"] = "www"
        item["snatch_time"] = datetime.datetime.now()
Example #3
    def parse(self, response):
        '''
        Extract each board's URL and pass it to parse_pages.

        :param response:
        :return:
        '''
        #test
        # url = 'http://m.sohu.com/cl/58/?page=1'
        # yield scrapy.Request(url, self.parse_item)
        #end test
        file_name = "log-" + str(datetime.date.today()) + ".txt"
        t_file = codecs.open(file_name, 'ab', encoding='utf-8')
        if self.isFirst:
            self.isFirst = False
            line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
            t_file.write(line.decode("unicode_escape"))

        updatetool = UpdateTool()
        self.log('Hi, this is a page! %s' % response.url)
        self.new = 0
        self.pages += 1
        root = bs(response.body.decode('utf-8'))
        div = root.find("div", attrs={"class": "bd3 pb1"})
        lis = div.findAll("p")
        for li in lis:
            item = FenghuoItem()
            iurl = 'm.sohu.com' + li.find("a").get("href")
            title = li.find("a").text
            pubdate = root.find('p', attrs={'class': 'w c2'}).text
            # the banner embeds the date at fixed offsets:
            # year 11:15, month 16:18, day 19:21, hour 22:24
            month = pubdate[16:18]
            day = pubdate[19:21]
            hour = pubdate[22:24]
            year = pubdate[11:15]
            item_date = datetime.date(int(year), int(month), int(day))
            item['url'] = iurl
            item['title'] = title
            item['pubdate'] = str(item_date)
            item['snatch_time'] = datetime.datetime.now()
            item['topPost'] = 1
            item['site_name'] = '手机搜狐网'
            item['site_url'] = "m.sohu.com/"
            print item
            if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
                self.new += 1
                self.total_new += 1
                fp.process_item(item, "123")
        url = 'http://m.sohu.com/cl/58/?page='+str(self.pages)
        if self.new > 3 and self.hasNext:
            yield scrapy.Request(url, self.parse)
        else:
            line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items " + self.name + " spider has finished start!\n\n"
            t_file.write(line.decode("unicode_escape"))
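
The fixed offsets above silently assume the banner text never changes length. With pubdate as read in the loop above, a regex makes the assumed date shape explicit and tolerates drift (a sketch, not the original code):

    import re
    import datetime

    m = re.search(r'(\d{4})\D+(\d{1,2})\D+(\d{1,2})\D+(\d{1,2})', pubdate)
    if m:
        year, month, day, hour = m.groups()
        item_date = datetime.date(int(year), int(month), int(day))
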
Example #4
    def parse_items(self,response):

        # title: done
        # txt: the main work, parsing the HTML and removing impurities
        # pubdate: done
        # snatch_time: static
        # site_url: static
        # site_name: static
        # url: done
        # topPost: static

        #init:
        html_parser = HTMLParser.HTMLParser()
        item = FenghuoItem()
        url = response.url

        self.log("Hi,this is in parse_items,url is %s" % url)
        root = bs(response.body)
        div = root.find("div", "ny_con news_con_ny")
        pubdate0 = div.find("p", "news_time").text
        year = pubdate0[3:7]
        month = pubdate0[8:10]
        day = pubdate0[11:13]
        hour = '00'
        pubdate = str(year) + "-" + month + "-" + day + " " + hour + ":00"
        ps = div.findAll("p","MsoNormal")
        title = div.find("h3").text
        html = ""
        # drop the trailing paragraph if it is pagination rubbish
        if ps[-1].find("div", "page"):
            del ps[-1]
        # get txt by paragraph
        for p in ps:
            #remove comments in text:
            comments = p.findAll(text=lambda text:isinstance(text,Comment))
            [comment.extract() for comment in comments]
            html = html + '\n' + p.text.encode('utf-8')
        text = html_parser.unescape(html)
        item['url'] = url
        item['title'] = html_parser.unescape(title)
        item['txt'] = text
        item['pubdate'] = str(pubdate)
        item['snatch_time'] = datetime.datetime.now()
        item['topPost'] = 1
        item['site_name'] = '武汉大学研究生院'
        item['site_url'] = "www.gs.whu.edu.cn/"
        # append the title and pubdate to the local log (encode the unicode title)
        with open('scrapy_log.txt', 'a') as f:
            f.write(html_parser.unescape(title).encode('utf-8') + '\n' + str(pubdate) + '\n')
        fp.process_item(item, "123")
Example #5
    def parse_item(self, response):
        '''
        Visit each news page and extract the item's fields.

        :param response:
        :return:
        '''
        self.log('Hi, this is an item page! %s' % response.url)
        item = FenghuoItem()
        root = bs(response.body)

        try:
            item['topPost'] = 1
            item["site_id"] = 17
            item['website_id'] = ''
            item["site_name"] = '襄城新闻网'
            item["area"] = 958
            item["site_weight"] = 2
            item['countryid'] = 1156
            item['province'] = 1673
            item['city'] = 136
            item["ip"] = socket.gethostbyname("www.xcxww.com")
            item["site_url"] = "www.xcxww.com"
            types = root.find("div", attrs={"class": "pagenav"}).findAll("a")
            item["forumurl"] = types[-1].get("href")
            item["site_type"] = '新闻'
            item["domain_1"] = "xcxww.com"
            item["domain_2"] = "www"
            item["url"] = response.url
            item["subname"] = types[-1].text
            item["pubdate"] = root.find("div", attrs={
                "class": "info"
            }).find("span").text
            # replace every image tag with its serialized markup so it survives text extraction
            imgs = root.find("div", attrs={"class": "content"}).findAll("img")
            for img in imgs:
                img.replaceWith(img.prettify())
            item["txt"] = root.find("div", attrs={
                "class": "content"
            }).text.replace("\r\n", "$*huanhang*$").replace(
                "\n",
                "$*huanhang*$").replace("\"",
                                        "'").replace("<br />", "$*huanhang*$")
            item["txt_len"] = len(item["txt"])
            item["title"] = root.find("h1").text
            item["snatch_time"] = datetime.datetime.now()
            return item
        except:
            # errors are logged to error1.json together with the url
            line = response.url + "\n"
            self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
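
The three-line error-logging block in the except branch above recurs in nearly every spider in this set; a small helper (hypothetical, not part of the original code) would remove the duplication and guarantee the handle is closed:

    import codecs

    def log_error(path, text):
        # append one line per failure and always close the file
        f = codecs.open(path, 'ab', encoding='utf-8')
        try:
            f.write(text + "\n")
        finally:
            f.close()
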
Example #6
    def parse_items(self, response):

        # title: done
        # txt: the main work, parsing the HTML and removing impurities
        # pubdate: done
        # snatch_time: static
        # site_url: static
        # site_name: static
        # url: done
        # topPost: static

        #init:
        html_parser = HTMLParser.HTMLParser()
        item = FenghuoItem()
        url = response.url

        self.log("Hi,this is in parse_items,url is %s" % url)
        root = bs(response.body)
        div = root.find("div", "content-box clear")
        pubdate0 = div.find("div", "time").text
        year = pubdate0[0:4]
        month = pubdate0[5:7]
        day = pubdate0[8:10]
        hour = pubdate0[11:13]
        minute = pubdate0[14:16]
        pubdate = str(
            year) + "-" + month + "-" + day + " " + hour + ":" + minute
        ps = div.findAll("p")
        title = div.find("h1").text
        html = ""
        # drop the trailing paragraph if it is pagination rubbish
        if ps[-1].find("div", "page"):
            del ps[-1]
        # get txt by paragraph
        for p in ps:
            #remove comments in text:
            comments = p.findAll(text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            html = html + '\n' + p.text.encode('utf-8')
        text = html_parser.unescape(html)
        item['url'] = url
        item['title'] = html_parser.unescape(title)
        item['txt'] = text
        item['pubdate'] = str(pubdate)
        item['snatch_time'] = datetime.datetime.now()
        item['topPost'] = 1
        item['site_name'] = '搜狐国内新闻'
        item['site_url'] = "news.sohus.com/"
        # no pagination on these pages: hand the item straight to the pipeline
        fp.process_item(item, "123")
Example #7
    def parse(self, response):
        '''
        Extract each board's URL and pass it to parse_pages.

        :param response:
        :return:
        '''
        #test
        # url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-1.html'
        # yield scrapy.Request(url, self.parse_item)
        #end test
        file_name = "log-" + str(datetime.date.today()) + ".txt"
        t_file = codecs.open(file_name, 'ab', encoding='utf-8')
        if self.isFirst:
            self.isFirst = False
            line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
            t_file.write(line.decode("unicode_escape"))

        updatetool = UpdateTool()
        self.log('Hi, this is a page! %s' % response.url)
        self.new = 0
        self.pages += 1
        root = bs(response.body.decode('utf-8'))
        div = root.find("div", attrs={"class": "ulnotice"})
        lis = div.findAll("li")
        for li in lis:
            item = FenghuoItem()
            iurl = li.find("a").get("href")
            if iurl[0:4]!='http':
                iurl='http://gs.whu.edu.cn'+iurl
            title = li.find("a").text
            pubdate = li.find("span").text
            month = pubdate[6:8]
            day = pubdate[9:11]
            hour = '00'
            year = pubdate[1:5]
            item_date = datetime.date(int(year), int(month), int(day))
            if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
                self.new += 1
                self.total_new += 1
                yield scrapy.Request(iurl, self.parse_items)  # request the detail page for iurl
        url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-'+str(self.pages)+'.html'
        if self.new > 10 and self.hasNext:
            yield scrapy.Request(url, self.parse)
        else:
            line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items " + self.name + " spider has finished start!\n\n"
            t_file.write(line.decode("unicode_escape"))
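
A note on the log writes above: line.decode("unicode_escape") only turns the byte string into unicode for the codecs handle, and it would mangle any literal backslash in the line. Since these log lines are plain ASCII, an explicit ascii decode states the intent without the escape processing (a sketch):

    line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
    t_file.write(line.decode("ascii"))  # same effect here, minus escape-sequence handling
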
Example #8
    def parse_page(self, response):
        self.log('Hi, this is a list page! %s' % response.url)
        root = bs(response.body.decode('gbk'))
        table = root.find("div", id="threadlist")
        trs = table.findAll("tr")
        for tr in trs:
            item = FenghuoItem()
            item["domain_1"] = "437600.net"
            item["domain_2"] = "www"
            item["site_id"] = 12
            item['website_id'] = ''
            item["site_name"] = '通山信息港'
            item["area"] = 958
            item["site_weight"] = 2
            item['countryid'] = 1156
            item['province'] = 1673
            item['city'] = 136
            item["ip"] = socket.gethostbyname("www.437600.net")
            item["site_url"] = "www.437600.net"
            item["forumurl"] = response.meta['forumurl']
            item["site_type"] = '论坛'
            item["snatch_time"] = datetime.datetime.now()
            try:
                item["url"] = root_domain + tr.find(
                    "a", attrs={
                        "href": re.compile("^thread")
                    }).get("href")
                item["title"] = tr.find("th").find("a",
                                                   attrs={
                                                       "href":
                                                       re.compile("^thread")
                                                   }).text
                item["author"] = tr.find("td", attrs={
                    "class": "by"
                }).find("cite").find("a").text
                item["userpage"] = root_domain + tr.find(
                    "td", attrs={
                        "class": "by"
                    }).find("cite").find("a").get("href")
                url_id = tr.findAll(
                    "td", attrs={"class":
                                 "by"})[0].find("cite").find("a").get("href")
                url_id = url_id[url_id.index("uid-") + 4:]
                item["userid"] = url_id[:url_id.index(".html")]
                try:
                    item["reply"] = tr.find("td", attrs={
                        "class": "num"
                    }).find("a").text
                except:
                    item["reply"] = ""
                try:
                    item["view"] = tr.find("td", attrs={
                        "class": "num"
                    }).find("em").text
                except:
                    item["view"] = ""

                try:
                    item["postid"] = tr.findAll(
                        "td", attrs={"class":
                                     "by"})[1].find("cite").find("a").text
                except:
                    item["postid"] = ""

                try:
                    item["subname"] = root_domain + tr.find("th").find(
                        "em").find("a").text
                except:
                    item["subname"] = ""

                try:
                    time1 = tr.findAll("td",
                                       attrs={"class":
                                              "by"})[1].find("em").find("a")
                    try:
                        item["updatetime"] = time1.find("span").get("title")
                    except:
                        item["updatetime"] = time1.text

                except:
                    item["updatetime"] = ""
                #explore the content of the page
                yield scrapy.Request(item["url"],
                                     self.parse_item,
                                     meta={'item': item})

            except Exception:
                # log the offending row to error1.json
                line = str(tr) + "\n"
                self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
                self.file.write(line.decode("unicode_escape"))
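
The userid extraction above slices between 'uid-' and '.html' by hand, which implies profile links of the shape space-uid-12345.html. A regex captures the same id in one step (a sketch):

    import re

    m = re.search(r'uid-(\d+)\.html', url_id)  # url_id: the profile href from the row
    if m:
        item["userid"] = m.group(1)
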
Example #9
    def parse_item(self, response):
        item = response.meta['item']
        item['articles'] = []
        new_root = bs(response.body.decode('gbk'))
        subnames = new_root.find("div", id="pt").findAll("a")
        item["subname"] = subnames[len(subnames) - 2].text
        try:
            pageText = new_root.find("div", attrs={
                "class": "pg"
            }).find("span").text

            pageText = pageText[pageText.index("/") + 2:]
            pageText = pageText[:pageText.index(" ")]
        except:
            pageText = 1
        url = response.url
        try:
            for page in range(1, int(pageText) + 1):
                r1 = requests.post(url)
                new_root = bs(r1.text)
                divs = new_root.findChildren(
                    'div', attrs={"id": re.compile("^post_[0-9]")})
                for div in divs:
                    c_item = FenghuoItem()
                    c_item["topPost"] = 0
                    c_item['author'] = div.find("table").find(
                        "td", attrs={
                            "class": "pls"
                        }).find("div", attrs={
                            "class": "authi"
                        }).find("a").text

                    c_item['userpage'] = div.find("table").find(
                        "td", attrs={
                            "class": "pls"
                        }).find("div", attrs={
                            "class": "authi"
                        }).find("a").get("href")

                    # four fallbacks for the pubdate, covering the page's markup variants
                    try:
                        c_item["pubdate"] = div.find("table").find(
                            "td", attrs={
                                "class": "plc"
                            }).find("div", attrs={
                                "class": "authi"
                            }).find("em").find("span").get("title")

                    except:
                        try:
                            t_time = div.find("table").find(
                                "td", attrs={
                                    "class": "plc"
                                }).find("div", attrs={
                                    "class": "authi"
                                }).find("em").text
                            c_item["pubdate"] = t_time[t_time.index(" ") + 1:]

                        except:
                            try:
                                c_item["pubdate"] = div.find("table").find(
                                    "td", attrs={
                                        "class": "plc comiis_vtx"
                                    }).find("div", attrs={
                                        "class": "authi"
                                    }).find("em").find("span").get("title")

                            except:
                                try:
                                    t_time = div.find("table").find(
                                        "td",
                                        attrs={
                                            "class": "plc comiis_vtx"
                                        }).find("div",
                                                attrs={
                                                    "class": "authi"
                                                }).find("em").text
                                    c_item["pubdate"] = t_time[t_time.
                                                               index(" ") + 1:]
                                except:
                                    raise
                    try:
                        c_item["postfloor"] = div.find("table").find(
                            "td", attrs={
                                "class": "plc"
                            }).find("div", attrs={
                                "class": "pi"
                            }).find("em").text
                        if int(c_item["postfloor"]) == 1:
                            c_item["topPost"] = 1
                    except:
                        c_item["postfloor"] = div.find("table").find(
                            "td", attrs={
                                "class": "pls"
                            }).findNextSibling().find("div",
                                                      attrs={
                                                          "class": "pi"
                                                      }).find("em").text
                        if int(c_item["postfloor"]) == 1:
                            c_item["topPost"] = 1

                    # clear css, js, advertisements and obfuscated anti-scrape text
                    styles = div.findAll("style")
                    scripts = div.findAll("script")
                    for style in styles:
                        style.clear()
                    for script in scripts:
                        script.clear()
                    advs = div.findAll(
                        "div",
                        attrs={"class": "attach_nopermission attach_tips"})
                    for adv in advs:
                        adv.clear()
                    m_codes = new_root.findAll("span",
                                               attrs={"style": "display:none"})
                    for m_code in m_codes:
                        m_code.clear()
                    m_codes = new_root.findAll("font",
                                               attrs={"class": "jammer"})
                    for m_code in m_codes:
                        m_code.clear()

                    # replace every image tag with its serialized markup so it survives text extraction
                    imgs = div.find("table").find("td", attrs={
                        "class": "pls"
                    }).findNextSibling().find("table").findAll("img")
                    for img in imgs:
                        img.replaceWith(img.prettify())

                    c_item["txt"] = div.find("table").find(
                        "td", attrs={
                            "class": "pls"
                        }).findNextSibling().find("table").text.replace(
                            "<br />",
                            " ").replace("\r\n",
                                         "$*huanhang*$").replace("\"", "‘")
                    c_item["txt_len"] = len(c_item["txt"])

                    # the rental-housing pages are buggy, so fetch the content once more

                    if c_item["topPost"] == 1 and item["subname"] == '房屋租售':
                        try:

                            #替换所有图片标签
                            imgs = new_root.find("div",
                                                 attrs={
                                                     "class": "t_fsz"
                                                 }).findAll("img")
                            for img in imgs:
                                img.replaceWith(img.prettify())

                            c_item['txt'] += new_root.find(
                                "div", attrs={
                                    "class": "t_fsz"
                                }).text.replace("\r\n",
                                                "$*huanhang*$").replace(
                                                    "\n",
                                                    "$*huanhang*$").replace(
                                                        "\"", "'").replace(
                                                            "<br />",
                                                            "$*huanhang*$")
                            c_item["txt_len"] = len(c_item["txt"])
                        except:
                            pass
                    item['articles'].append(dict(c_item))

                try:
                    url = root_domain + new_root.find("div",
                                                      attrs={
                                                          "class": "pgbtn"
                                                      }).find("a").get("href")
                    print url
                except:
                    pass
            return item
        except:
            line = response.url + "\n"
            self.file = codecs.open('error2.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
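
The four nested try/except blocks that hunt for the pubdate can be collapsed into one loop over the candidate td classes. A sketch equivalent to the fallback chain above (BeautifulSoup raises AttributeError when a find in the chain returns None):

    def find_pubdate(div):
        # prefer the <span title="..."> form, else strip the label from em.text
        for td_class in ("plc", "plc comiis_vtx"):
            try:
                em = div.find("table").find(
                    "td", attrs={"class": td_class}).find(
                        "div", attrs={"class": "authi"}).find("em")
                span = em.find("span")
                if span:
                    return span.get("title")
                t_time = em.text
                return t_time[t_time.index(" ") + 1:]
            except AttributeError:
                continue
        raise ValueError("no pubdate structure matched")
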
Example #10
    def parse_pages(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        root_url = response.url

        last_t = "0"
        new = 21
        url = root_url
        while new > 20:
            new = 0
            r = requests.get(url)
            r.encoding = "gbk"
            root = bs(r.text)

            table = root.find("table", id="threadlisttableid")
            # print table
            try:
                trs = table.findAll("tr")
            except:
                line = response.url + "\n"
                self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
                self.file.write(line.decode("unicode_escape"))
                continue
            for tr in trs:
                item = FenghuoItem()
                item["domain_1"] = "hbha.com.cn"
                item["domain_2"] = "bbs"
                item["site_id"] = 36
                item['website_id'] = ''
                item["site_name"] = '红安论坛'
                item["area"] = 3507
                item["site_weight"] = 2
                item['countryid'] = 1156
                item['province'] = 1673
                item['city'] = 2508
                item["ip"] = socket.gethostbyname("bbs.hbha.com.cn")
                item["site_url"] = "bbs.hbha.com.cn"
                item["forumurl"] = root_url
                item["site_type"] = '论坛'
                item["snatch_time"] = datetime.datetime.now()
                try:
                    item["url"] = root_domain + tr.find(
                        "a", attrs={
                            "href": re.compile("^forum")
                        }).get("href")
                    item["title"] = tr.find("th").find("a",
                                                       attrs={
                                                           "href":
                                                           re.compile("^forum")
                                                       }).text
                    item["author"] = tr.find("td", attrs={
                        "class": "by"
                    }).find("cite").find("a").text
                    item["userpage"] = root_domain + tr.find(
                        "td", attrs={
                            "class": "by"
                        }).find("cite").find("a").get("href")
                    url_id = tr.findAll("td", attrs={
                        "class": "by"
                    })[0].find("cite").find("a").get("href")
                    url_id = url_id[url_id.index("uid=") + 4:]
                    item["userid"] = url_id
                    try:
                        item["reply"] = tr.find("td", attrs={
                            "class": "num"
                        }).find("a").text
                    except:
                        item["reply"] = ""
                    try:
                        item["view"] = tr.find("td", attrs={
                            "class": "num"
                        }).find("em").text
                    except:
                        item["view"] = ""

                    try:
                        item["postid"] = tr.findAll(
                            "td", attrs={"class":
                                         "by"})[1].find("cite").find("a").text
                    except:
                        item["postid"] = ""

                    try:
                        item["subname"] = root_domain + tr.find("th").find(
                            "em").find("a").text
                    except:
                        item["subname"] = ""

                    try:
                        time1 = tr.findAll("td",
                                           attrs={"class": "by"
                                                  })[1].find("em").find("a")
                        try:
                            item["updatetime"] = time1.find("span").get(
                                "title")
                        except:
                            item["updatetime"] = time1.text

                    except:
                        item["updatetime"] = ""
                    #explore the content of the page

                    # incremental check: only follow threads updated since last_t
                    if item["updatetime"] and item["updatetime"] > last_t:
                        new += 1
                        print item["url"]
                        yield scrapy.Request(item["url"],
                                             self.parse_item,
                                             meta={'item': item})

                    if item["updatetime"] and item["updatetime"] == last_t:
                        new += 1
                        #update before

                except Exception:
                    line = str(tr) + "\n"
                    self.file = codecs.open('error1.json',
                                            'ab',
                                            encoding='utf-8')
                    self.file.write(line.decode("unicode_escape"))
            try:
                url = root_domain + root.find("div", attrs={
                    "class": "pg"
                }).find("a", attrs={
                    "class": "nxt"
                }).get("href")
                print url
            except:
                break
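
The incremental check above compares updatetime strings with > and ==; that only works if the forum renders timestamps zero-padded (e.g. '2015-07-16 10:00'), so that lexicographic order coincides with chronological order:

    # zero-padded timestamps sort correctly as plain strings:
    assert "2015-07-16 10:00" > "2015-07-15 23:59"
    assert "2015-07-16 09:59" < "2015-07-16 10:00"
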
Example #11
    def parse_item(self, response):
        '''
        Visit each news page and extract the item's fields.

        :param response:
        :return:
        '''
        self.log('Hi, this is an item page! %s' % response.url)
        item = FenghuoItem()
        root = bs(response.body)

        try:
            item['topPost'] = 1
            item["site_id"] = 17
            item['website_id'] = ''
            item["site_name"] = '中国通山网'
            item["area"] = 958
            item["site_weight"] = 2
            item['countryid'] = 1156
            item['province'] = 1673
            item['city'] = 136
            item["ip"] = socket.gethostbyname("www.cntongshan.com")
            item["site_url"] = "www.cntongshan.com"
            item["forumurl"] = response.meta['forumurl']
            item["site_type"] = '新闻'
            item["url"] = response.url
            url = response.url
            # the news id sits between '-' and '.' in the url; renamed from `id`,
            # which shadowed the builtin
            news_id = url[url.index("-") + 1:]
            news_id = news_id[:news_id.index(".")]
            item["parent_type"] = root.find("a", attrs={
                "class": "Current"
            }).text
            item["subname"] = root.find("a",
                                        attrs={
                                            "class": "A",
                                            "target": "_blank"
                                        }).text
            str = root.find("div", attrs={
                "class": "Title_h1"
            }).find("div").text
            str1 = str[str.index('20'):]
            item["pubdate"] = str1[:str1.index('\n') - 2]
            try:
                str2 = str[str.index('作者') + 2:]
                item["author"] = str2[:str2.index("浏览")].replace(
                    "\r\n", "$*huanhang*$")
            except:
                item["author"] = ""

            # the view count is loaded via an AJAX GET request, so fetch it with
            # requests; news_id was extracted from the url above
            r = requests.get(
                "http://www.cntongshan.com/public/ajax.aspx?action=addnum&id="
                + news_id + "&t=4&_=1437061503826")
            item["view"] = int(r.text[:r.text.index(",")])
            item["txt"] = root.find("div", attrs={
                "class": "content_main"
            }).text.replace("\r\n", "$*huanhang*$").replace("\"", "‘")
            item["txt_len"] = len(item["txt"])
            item["title"] = root.find("h1").text
            item["domain_1"] = "cntongshan.com"
            item["domain_2"] = "www"
            item["snatch_time"] = datetime.datetime.now()
            return item

        except:
            # errors are logged to error1.json together with the url
            line = response.url + "\n"
            self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
Example #12
    def parse_pages(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        root = bs(response.body.decode('gbk'))
        try:
            pageText = root.find("div", attrs={
                "class": "pg"
            }).find("span").text

            pageText = pageText[pageText.index("/") + 2:]
            pageText = pageText[:pageText.index(" ")]
        except:
            pageText = 1
        root_url = response.url
        root_url = root_url[:root_url.index(".html") - 1]
        # for i in range(1, int(pageText)+1):
        #     url = root_url + str(i) +".html"
        #     yield scrapy.Request(url, self.parse_page, meta={'forumurl': root_url + "1.html"})
        #
        new = 21  # seed above the threshold so the first page is always fetched
        url = root_url + str(1) + ".html"
        while new > 20:
            new = 0  # reset per page; only freshly-updated threads keep the loop going
            r = requests.get(url)
            root = bs(r.text)
            url = root.find("div", attrs={
                "class": "pg"
            }).find("a", attrs={
                "class": "nxt"
            }).get("href")
            table = root.find("div", id="threadlist")
            trs = table.findAll("tr")
            for tr in trs:
                item = FenghuoItem()
                item["domain_1"] = "437600.net"
                item["domain_2"] = "www"
                item["site_id"] = 12
                item['website_id'] = ''
                item["site_name"] = '通山信息港'
                item["area"] = 958
                item["site_weight"] = 2
                item['countryid'] = 1156
                item['province'] = 1673
                item['city'] = 136
                item["ip"] = socket.gethostbyname("www.437600.net")
                item["site_url"] = "www.437600.net"
                item["forumurl"] = response.meta['forumurl']
                item["site_type"] = '论坛'
                item["snatch_time"] = datetime.datetime.now()
                try:
                    item["url"] = root_domain + tr.find(
                        "a", attrs={
                            "href": re.compile("^thread")
                        }).get("href")
                    item["title"] = tr.find("th").find(
                        "a", attrs={
                            "href": re.compile("^thread")
                        }).text
                    item["author"] = tr.find("td", attrs={
                        "class": "by"
                    }).find("cite").find("a").text
                    item["userpage"] = root_domain + tr.find(
                        "td", attrs={
                            "class": "by"
                        }).find("cite").find("a").get("href")
                    url_id = tr.findAll("td", attrs={
                        "class": "by"
                    })[0].find("cite").find("a").get("href")
                    url_id = url_id[url_id.index("uid-") + 4:]
                    item["userid"] = url_id[:url_id.index(".html")]
                    try:
                        item["reply"] = tr.find("td", attrs={
                            "class": "num"
                        }).find("a").text
                    except:
                        item["reply"] = ""
                    try:
                        item["view"] = tr.find("td", attrs={
                            "class": "num"
                        }).find("em").text
                    except:
                        item["view"] = ""

                    try:
                        item["postid"] = tr.findAll(
                            "td", attrs={"class":
                                         "by"})[1].find("cite").find("a").text
                    except:
                        item["postid"] = ""

                    try:
                        item["subname"] = root_domain + tr.find("th").find(
                            "em").find("a").text
                    except:
                        item["subname"] = ""

                    try:
                        time1 = tr.findAll("td",
                                           attrs={"class": "by"
                                                  })[1].find("em").find("a")
                        try:
                            item["updatetime"] = time1.find("span").get(
                                "title")
                        except:
                            item["updatetime"] = time1.text

                    except:
                        item["updatetime"] = ""
                    #explore the content of the page

                    # incremental check (last_t is empty here, so every thread counts as new)
                    last_t = ""
                    if item["updatetime"] and item["updatetime"] > last_t:
                        new += 1
                        yield scrapy.Request(item["url"],
                                             self.parse_item,
                                             meta={'item': item})

                    if item["updatetime"] and item["updatetime"] == last_t:
                        new += 1
                        #update before
                        pass

                except Exception:
                    # log the offending row to error1.json
                    line = str(tr) + "\n"
                    self.file = codecs.open('error1.json',
                                            'ab',
                                            encoding='utf-8')
                    self.file.write(line.decode("unicode_escape"))
Example #13
    def parse(self, response):
        self.log('Hi, this is an item page! %s' % response.url)

        root_url = response.url

        last_t = "0"
        new = 21
        url = root_url
        has_next = True
        while new > 5 and has_next:
            new = 0
            has_next = False
            r = requests.get(url)
            r.encoding = "gbk"
            root = bs(r.text)
            table = root.find("div", attrs={"id": "wrapper"})
            try:
                trs = table.findAll("tbody")
            except:
                line = response.url + "\n"
                self.file = codecs.open('error1.json', 'ab', encoding='utf-8')
                self.file.write(line.decode("unicode_escape"))
                continue
            for tr in trs:
                item = FenghuoItem()
                item["domain_1"] = "461700.org"
                item["domain_2"] = "www"
                item["site_id"] = 46
                item['website_id'] = ''
                item["site_name"] = '襄城论坛'
                item["area"] = 3507
                item["site_weight"] = 2
                item['countryid'] = 1156
                item['province'] = 1673
                item['city'] = 996
                item["ip"] = socket.gethostbyname("www.461700.org")
                item["site_url"] = "www.461700.org"
                item["forumurl"] = root_url
                item["site_type"] = '论坛'
                item["snatch_time"] = datetime.datetime.now()
                item["url"] = root_domain + tr.find(
                    "a", attrs={
                        "href": re.compile("^show")
                    }).get("href")
                item["title"] = tr.find("a",
                                        attrs={
                                            "href": re.compile("^show")
                                        }).text
                item["author"] = tr.find("td", attrs={
                    "class": "author"
                }).find("cite").text
                try:
                    item["reply"] = tr.find("td", attrs={
                        "class": "nums"
                    }).find("strong").text
                except:
                    item["reply"] = ""
                try:
                    item["view"] = tr.find("td", attrs={
                        "class": "nums"
                    }).find("em").text
                except:
                    item["view"] = ""

                try:
                    item["postid"] = tr.findAll("td",
                                                attrs={"class": "lastpost"
                                                       })[1].find("cite").text
                except:
                    item["postid"] = ""

                try:
                    item["subname"] = tr.find("div",
                                              attrs={
                                                  "class": "tietitle"
                                              }).find("em").find("a").text
                except:
                    item["subname"] = ""

                new += 1
                print item["url"]
                yield scrapy.Request(item["url"],
                                     self.parse_item,
                                     meta={'item': item})

            try:
                urls = root.find("div", attrs={
                    "class": "meneame"
                }).findAll("a")
                for u in urls:
                    if u.text == "下一页":
                        url = root_domain + u.get("href")
                        has_next = True
                        print url
            except:
                break
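
The '下一页' (next page) lookup in the pager recurs in this spider and the two that follow; a hypothetical helper makes the walk explicit:

    def find_next_url(root, root_domain):
        # return the absolute url behind the "下一页" link in the .meneame
        # pager, or None when there is no next page (hypothetical helper)
        pager = root.find("div", attrs={"class": "meneame"})
        if pager is None:
            return None
        for a in pager.findAll("a"):
            if a.text == u"下一页":
                return root_domain + a.get("href")
        return None
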
Example #14
    def parse_item(self, response):
        item = response.meta['item']
        item['articles'] = []
        item["txt"] = ""
        new_root = bs(response.body.decode('gbk'))
        subnames = new_root.find("div", id="nav1").findAll("a")
        item["subname"] = subnames[len(subnames) - 1].text
        url = response.url
        hasNext = True
        try:
            while hasNext:
                hasNext = False
                r1 = requests.post(url)
                r1.encoding = 'gbk'
                new_root = bs(r1.text)
                divs = new_root.find("div", id="wrapper").findChildren(
                    'table', attrs={"class": "showTie"})
                for div in divs:
                    c_item = FenghuoItem()
                    c_item["topPost"] = 0
                    try:
                        c_item['author'] = div.find("td",
                                                    attrs={
                                                        "class": "aa"
                                                    }).find("a").text
                        c_item['userpage'] = div.find("td",
                                                      attrs={
                                                          "class": "aa"
                                                      }).find("a").text
                        times = div.find("td", attrs={
                            "class": "aa"
                        }).findAll("li")
                        c_item["pubdate"] = times[len(times) - 1].text
                        item["updatetime"] = c_item["pubdate"]
                        try:
                            c = div.find("td", attrs={
                                "class": "bb"
                            }).find("span").text
                            c_item["postfloor"] = c[:c.index("阅") - 1]
                            if c_item["postfloor"] == "楼主":
                                c_item["postfloor"] = 1
                                c_item["topPost"] = 1
                                item["pubdate"] = c_item["pubdate"]
                                item['userpage'] = c_item['userpage']
                        except:
                            c = div.find("td", attrs={
                                "class": "bb"
                            }).find("div", attrs={
                                "class": "tiefoot s_clear"
                            }).find("span").text
                            c = c[c.index("回复") + 2:]
                            c_item["postfloor"] = c[:c.index("楼")]
                            c_item["postfloor"] = int(c_item["postfloor"]) + 1
                        styles = div.findAll("style")
                        scripts = div.findAll("script")
                        for style in styles:
                            style.clear()
                        for script in scripts:
                            script.clear()

                        # replace every image tag with its serialized markup so it survives text extraction
                        imgs = div.find("td", attrs={
                            "class": "bb"
                        }).find("div").findAll("img")
                        for img in imgs:
                            img.replaceWith(img.prettify())
                        c_item["txt"] = div.find("td", attrs={
                            "class": "bb"
                        }).find("div").text.replace("<br />", " ").replace(
                            "\r\n", "$*huanhang*$").replace("\"", "‘")
                        strs = str(c_item["postfloor"]) + "." + str(
                            c_item["txt"]) + "\n"
                        item["txt"] += strs
                        item["txt_len"] = len(item["txt"])
                        c_item["txt_len"] = len(c_item["txt"])
                        item['articles'].append(dict(c_item))

                    except:
                        line = str(div) + "\n"
                        self.file = codecs.open('error2.json',
                                                'ab',
                                                encoding='utf-8')
                        self.file.write(line.decode("unicode_escape"))
                try:
                    curls = new_root.find("div", attrs={
                        "class": "meneame"
                    }).findAll("a")
                    for curl in curls:
                        if curl.text == "下一页" and url != root_domain + curl.get(
                                "href"):
                            url = curl
                            hasNext = True
                            print url
                except:
                    pass
            return item
        except:
            line = response.url + "\n"
            self.file = codecs.open('error2.json', 'ab', encoding='utf-8')
            self.file.write(line.decode("unicode_escape"))
            pass
Example #15
    def parse_items(self, response):

        # title: done
        # txt: the main work, parsing the HTML and removing impurities
        # pubdate: done
        # snatch_time: static
        # site_url: static
        # site_name: static
        # url: done
        # topPost: static

        #init:
        html_parser = HTMLParser.HTMLParser()
        item = FenghuoItem()
        url = response.url

        self.log("Hi,this is in parse_items,url is %s" % url)
        root = bs(response.body)
        div = root.find("div", "conText")
        strong1 = div.find("strong", "fromSummary")
        pubdate = div.find("strong", "timeSummary").text
        ps = div.find("div", attrs={"id": "text"}).findAll("p")
        item_page = div.find("div", attrs={"id": "pages"})
        title = div.find("h1").text
        html = ""
        # drop the trailing paragraph if it is pagination rubbish
        if ps[-1].find("div", "page"):
            del ps[-1]
        # get txt by paragraph
        for p in ps:
            #remove comments in text:
            comments = p.findAll(text=lambda text: isinstance(text, Comment))
            [comment.extract() for comment in comments]
            html = html + '\n' + p.text.encode('utf-8')
        text = html_parser.unescape(html)
        item['url'] = url
        item['title'] = html_parser.unescape(title)
        item['txt'] = text
        item['pubdate'] = str(pubdate)
        item['snatch_time'] = datetime.datetime.now()
        item['topPost'] = 1
        item['site_name'] = '环球国内新闻'
        item['site_url'] = "china.huanqiu.com/"
        # check whether the txt of this page continues on a next page
        next_url = None
        if item_page:
            for a in item_page.findAll("a"):
                if a.text == u"下一页":
                    next_url = a.get('href')  # the next page of this article
        if next_url is None:
            # all txt on a single page: just hand the item to the pipeline
            fp.process_item(item, "123")
        elif next_url == url:
            # we are parsing the last page of the whole txt
            print next_url
        else:
            # there is still a next page holding the remaining part of the txt
            yield scrapy.Request(url=next_url,
                                 meta={'item': item},
                                 callback=self.parse_items_page)
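
parse_items_page is referenced above but not included in this set; under the assumption that it simply appends each continuation page's paragraphs and keeps following '下一页' until the pager loops back, it would look roughly like this (a sketch, not the original code):

    def parse_items_page(self, response):
        item = response.meta['item']
        root = bs(response.body)
        div = root.find("div", "conText")
        # append this page's paragraphs to the accumulated article text
        for p in div.find("div", attrs={"id": "text"}).findAll("p"):
            item['txt'] += '\n' + p.text.encode('utf-8')
        next_url = None
        pages = div.find("div", attrs={"id": "pages"})
        if pages:
            for a in pages.findAll("a"):
                if a.text == u"下一页":
                    next_url = a.get('href')
        if next_url and next_url != response.url:
            yield scrapy.Request(url=next_url,
                                 meta={'item': item},
                                 callback=self.parse_items_page)
        else:
            fp.process_item(item, "123")  # last page: hand the finished item over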