Пример #1
0
    def parse_detail(self, response):
        """Scrape one article page and yield it as a ``WxappItem``.

        Fields set: ``title``, ``author``, ``pub_time``, ``article_content``
        (all text nodes under the content cell, concatenated and stripped)
        and ``GitHub_Star``.
        """
        byline = response.xpath('//p[@class="authors"]')
        body_parts = response.xpath(
            '//td[@id="article_content"]//text()').getall()

        # NOTE(review): the original comment described the last value as the
        # read count, while the item field is named GitHub_Star -- confirm
        # which one is meant.
        fields = {
            'title': response.xpath('//h1[@class="ph"]/text()').get(),
            'author': byline.xpath('./a/text()').get(),
            'pub_time': byline.xpath('./span/text()').get(),
            'article_content': ''.join(body_parts).strip(),
            'GitHub_Star': response.xpath(
                '//div[@class="cl"]/div/a/text()').get(),
        }

        # Populate the item field by field, in the same order as before.
        item = WxappItem()
        for key, value in fields.items():
            item[key] = value

        yield item
Пример #2
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with title, author, pub_time and content."""
     byline = response.xpath('//p[@class="authors"]')
     text_nodes = response.xpath('//td[@id="article_content"]//text()').getall()
     yield WxappItem(
         title=response.xpath('//h1[@class="ph"]/text()').get(),
         author=byline.xpath('./a/text()').get(),
         pub_time=byline.xpath('./span/text()').get(),
         # Every text node of the content cell glued together, then stripped.
         content="".join(text_nodes).strip(),
     )
Пример #3
0
    def parse_detial(self, response):
        """Yield a ``WxappItem`` holding title, author and content.

        ``content`` is the list obtained by whitespace-splitting the
        concatenated text nodes of the content cell.
        """
        text_nodes = response.xpath(
            '//td[@id="article_content"]//text()').getall()
        # NOTE(review): str.split() produces a list of tokens, not a string;
        # confirm this is intended rather than str.strip().
        tokens = "".join(text_nodes).split()

        fields = {
            'title': response.xpath("//h1[@class='ph']/text()").get(),
            'author': response.xpath("//p[@class='authors']//a/text()").get(),
            'content': tokens,
        }
        yield WxappItem(**fields)
Пример #4
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` carrying only the title and publication time.

     Author and article-body extraction are not performed here.
     """
     byline = response.xpath("//p[@class='authors']")
     yield WxappItem(
         title=response.xpath("//h1[@class='ph']/text()").get(),
         pub_time=byline.xpath(".//span/text()").get(),
     )
Пример #5
0
    def parse_detail(self, response):
        """Extract title, author, publication time and body text into a ``WxappItem``."""
        # The <p class="authors"> element holds both the author link and the time span.
        authors_node = response.xpath("//p[@class='authors']")
        paragraphs = response.xpath("//td[@id='article_content']//text()").getall()

        fields = dict(
            title=response.xpath("//h1[@class='ph']/text()").get(),
            author=authors_node.xpath(".//a/text()").get(),
            pub_time=authors_node.xpath(".//span/text()").get(),
            # Body: all text nodes joined into one string, outer whitespace removed.
            content="".join(paragraphs).strip(),
        )
        yield WxappItem(**fields)
Пример #6
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` built from positional XPaths under ``#ct``."""
     header_title = response.xpath(
         '//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[1]/h1//text()').get()
     byline = response.xpath(
         '//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[3]/div[1]/p')
     # Body: every text node below #article_content, joined and stripped.
     body_text = "".join(
         response.xpath('//*[@id="article_content"]//text()').getall()).strip()

     item = WxappItem(
         title=header_title,
         author=byline.xpath('.//a/text()').get(),
         pub_time=byline.xpath('.//span//text()').get(),
         content=body_text,
     )
     yield item
Пример #7
0
    def parse_item(self, response):
        """Yield a ``WxappItem`` with title, author, pub_time and content.

        Each field is the first result of its XPath query on ``response``.
        """
        # Field name -> XPath whose first match becomes the field value.
        queries = {
            'title': r"//h1[@class='ph']/text()",
            'author': r"//p[@class='authors']/a/text()",
            'pub_time': r"//span[@class='time']/text()",
            'content': r"//div[@class='blockquote']/p/text()",
        }
        fields = {name: response.xpath(path).get()
                  for name, path in queries.items()}
        yield WxappItem(**fields)
Пример #8
0
    def parse_detail(self, response):
        """Print the request URL, then yield a ``WxappItem`` for the article.

        ``title``, ``author`` and ``time`` are stripped of surrounding
        whitespace. A field whose XPath matches nothing is left as ``None``
        instead of raising ``AttributeError`` from ``None.strip()``.
        ``content`` is all text under the content cell, joined and stripped.
        """
        def first_text(path):
            # .get() yields None when nothing matches, so guard before stripping.
            value = response.xpath(path).get()
            return value.strip() if value is not None else None

        print("\033[0;37;40m\t=============URL===============\033[0m")
        print("\033[0;37;40m\t" + response.request.url + "\033[0m")

        title = first_text("//div[@class='h hm cl']/div[@class='cl']/h1/text()")
        author = first_text("//div[@class='avatar_right cl']//p[@class='authors']/a/text()")
        # Local is called pub_time so it does not shadow the stdlib ``time``
        # module name; the item field is still ``time``.
        pub_time = first_text("//div[@class='avatar_right cl']//p[@class='authors']/span/text()")
        content = response.xpath("//div[@class='content_middle cl']/div[@class='d']/table//td[@id='article_content']//text()").getall()
        content = "".join(content).strip()
        item = WxappItem(title=title, author=author, time=pub_time, content=content)
        yield item
Пример #9
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with title, author, pub_time and content.

     ``content`` is every text node under the ``content_middle cl`` div,
     concatenated into one string and stripped of surrounding whitespace.
     """
     title = response.xpath('//*[@id="ct"]/div[1]/div/div[1]/div/div[2]/div[1]/h1/text()').get()
     # Was '///p[...]': a triple slash is not valid XPath 1.0 syntax; '//' is
     # the descendant shorthand.
     author_info = response.xpath('//p[@class="authors"]')
     author = author_info.xpath('.//a/text()').get()
     pub_time = author_info.xpath('.//span/text()').get()
     article_content = response.xpath("//div[@class='content_middle cl']//text()").getall()
     content = "".join(article_content).strip()
     item = WxappItem(title=title, author=author, pub_time=pub_time, content=content)
     yield item
Пример #10
0
 def parse_item(self, response):
     """Yield a ``WxappItem`` with the title and the body text (field ``wen``)."""
     text_nodes = response.xpath('//td[@id="article_content"]//text()').getall()
     fields = {
         'title': response.xpath("//h1[@class='ph']/text()").get(),
         # Body text: all text nodes joined, surrounding whitespace removed.
         'wen': ''.join(text_nodes).strip(),
     }
     yield WxappItem(**fields)
Пример #11
0
 def parse_item(self, response):
     """Yield a ``WxappItem`` with title, authors, pub_time and content."""
     heading = response.xpath('//h1[@class="ph"]/text()').get()
     writer = response.xpath('//p[@class="authors"]/a/text()').get()
     published = response.xpath('//p[@class="authors"]/span/text()').get()
     # Collect the text nodes first, then collapse them into one stripped string.
     text_nodes = response.xpath('//td[@id="article_content"]//text()').getall()
     body = "".join(text_nodes).strip()
     yield WxappItem(title=heading, authors=writer, pub_time=published, content=body)
Пример #12
0
 def parse_detail(self,reponse):
     """Print the scraped fields, then yield them as a ``WxappItem``."""
     heading = reponse.xpath("//h1/text()").get()
     byline = reponse.xpath("//p[@class='authors']")
     name = byline.xpath("./a/text()").get()
     pubdate = byline.xpath("./span/text()").get()
     # Body: direct text of every <section>/<p>, joined and stripped.
     body = "".join(reponse.xpath("//section/p/text()").getall()).strip()

     print("title is %s \n"%heading)
     print("the author is %s ,pubdate is %s"%(name,pubdate))
     print("content    : %s"%body)

     yield WxappItem(
         title=heading,
         author_name=name,
         author_pubdate=pubdate,
         content=body,
     )
Пример #13
0
 def parse_item(self, response):
     """Yield a ``WxappItem`` with title, author, pub_time and content."""
     text_nodes = response.xpath(
         "//td[@id='article_content']//text()").getall()
     fields = {
         'title': response.xpath("//h1[@class='ph']/text()").get(),
         'author': response.xpath("//p[@class='authors']/a/text()").get(),
         # The publication time is the <span> text inside the authors paragraph.
         'pub_time': response.xpath("//p[@class='authors']/span/text()").get(),
         'content': "".join(text_nodes).strip(),
     }
     yield WxappItem(**fields)
Пример #14
0
 def parse_detall(self, response):
     """Yield a ``WxappItem`` with title, author, put_time and content.

     The author link and the ``time`` span are both looked up inside the
     ``<p class="authors">`` element. ``content`` is all text under the
     content cell, joined and stripped.
     """
     title = response.xpath("//h1[@class='ph']/text()").get()
     byline = response.xpath("//p[@class='authors']")
     author = byline.xpath(".//a/text()").get()
     # The leading '.' keeps the query relative to the byline; a bare '//'
     # searches the whole document even when called on a sub-selector.
     put_time = byline.xpath(".//span[@class='time']/text()").get()
     article_content = response.xpath(
         "//td[@id='article_content']//text()").getall()
     content = "".join(article_content).strip()
     item = WxappItem(title=title,
                      author=author,
                      put_time=put_time,
                      content=content)
     yield item
Пример #15
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` for the article, then print a separator line.

     ``title``, ``author`` and ``pub_time`` are stripped of surrounding
     whitespace. A field whose XPath matches nothing is left as ``None``
     instead of raising ``AttributeError`` from ``None.strip()``.
     ``content`` is the list of text nodes under the content cell (not joined).
     """
     def first_text(path):
         # .get() yields None when nothing matches, so guard before stripping.
         value = response.xpath(path).get()
         return value.strip() if value is not None else None

     title = first_text('//h1[@class="ph"]/text()')
     author = first_text('//p[@class="authors"]/a/text()')
     pub_time = first_text(
         ' //p[@class="authors"]/span/text()')
     content = response.xpath(
         '//td[@id="article_content"]//text()').getall()
     item = WxappItem(title=title,
                      author=author,
                      pub_time=pub_time,
                      content=content)
     yield item
     # Runs only when the generator is resumed after the item was consumed.
     print('--' * 30)
Пример #16
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with author, title, pub_time and content.

     Per the original author's note, this callback is deliberately not named
     ``parse``: that name is what the framework itself runs, and overriding
     it would clobber the built-in behaviour.
     """
     byline = response.xpath("//p[@class='authors']")
     text_nodes = response.xpath(
         "//td[@id='article_content']//text()").getall()

     fields = {
         'author': byline.xpath(".//a/text()").get(),
         'title': response.xpath("//h1[@class='ph']/text()").get(),
         'pub_time': byline.xpath(".//span/text()").get(),
         # Body: all text nodes joined, surrounding whitespace removed.
         'content': "".join(text_nodes).strip(),
     }
     yield WxappItem(**fields)
Пример #17
0
 def parse_item(self, response):
     """Return a ``WxappItem`` with title, author, pub_time and article_content.

     ``article_content`` is the list obtained by whitespace-splitting the
     concatenated text nodes of the content cell.
     """
     byline = response.xpath('//p[@class="authors"]')
     text_nodes = response.xpath(
         '//td[@id="article_content"]//text()').getall()
     # NOTE(review): str.split() produces a list of tokens, not a string;
     # confirm this is intended rather than str.strip().
     tokens = ''.join(text_nodes).split()

     item = WxappItem()
     for key, value in (
         ('title', response.xpath('//h1[@class="ph"]/text()').get()),
         ('author', byline.xpath('.//a/text()').get()),
         ('pub_time', byline.xpath('.//span/text()').get()),
         ('article_content', tokens),
     ):
         item[key] = value
     return item
Пример #18
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with title, author, time and content."""
     byline = response.xpath("//p[@class='authors']")
     # Body: every text node of the content cell, joined and stripped.
     body = ''.join(
         response.xpath("//td[@id='article_content']//text()").getall()
     ).strip()

     yield WxappItem(
         title=response.xpath("//h1[@class='ph']/text()").get(),
         author=byline.xpath(".//a/text()").get(),
         time=byline.xpath(".//span/text()").get(),
         content=body,
     )
Пример #19
0
    def parse_item(self, response):
        """Return a ``WxappItem`` with title, author, pub_time and content.

        Author and publication time are read from the ``<p>`` elements inside
        the ``avatar_right cl`` div; ``content`` is all text under the content
        cell, joined and stripped.
        """
        # The former ``item = {}`` placeholder was overwritten before use and
        # has been dropped together with the commented-out template lines.
        title = response.xpath('//h1[@class="ph"]/text()').get()
        author_area = response.xpath('//div[@class="avatar_right cl"]//p')
        author = author_area.xpath('./a/text()').get()
        pub_time = author_area.xpath('./span/text()').get()
        content = response.xpath('//td[@id="article_content"]//text()').getall()
        content = ''.join(content).strip()
        return WxappItem(title=title, author=author, pub_time=pub_time, content=content)
Пример #20
0
    def parse_detail(self, response):
        """Yield a ``WxappItem`` with title, author, pub_time and article.

        ``article`` is all text under the content cell, joined into one string
        and stripped of surrounding whitespace.
        """
        title = response.xpath("//h1[@class='ph']/text()").get()
        author_p = response.xpath("//p[@class='authors']")
        author = author_p.xpath("./a/text()").get()
        pub_time = author_p.xpath(".//span[@class='time']/text()").get()
        # The content cell is a document-wide lookup, so query the response
        # itself. Going through author_p returned nothing whenever the byline
        # paragraph was missing, even if the article body was present.
        article = response.xpath(
            "//td[@id='article_content']//text()").getall()
        article = ''.join(article).strip()
        item = WxappItem(title=title,
                         author=author,
                         pub_time=pub_time,
                         article=article)

        yield item
Пример #21
0
    def parse_detail(self, response):
        """Scrape an article detail page, yield the item, then print it.

        The printing happens after the ``yield``, i.e. once the generator is
        resumed.
        """
        heading = response.xpath("//*[@id='ct']/div[1]/div/div[1]/div/div[2]/div[1]/h1/text()").get()
        writer = response.xpath("//p[@class='authors']//a/text()").get()
        published = response.xpath("//*[@id='ct']/div[1]/div/div[1]/div/div[2]/div[3]/div[1]/p/span/text()").get()
        # Join every text node found under any <td> into one stripped string.
        body = "".join(response.xpath("//td//text()").getall()).strip()

        item = WxappItem(title=heading, author=writer, time=published, content=body)
        # Per the original note, the yielded item is handed on to the pipelines.
        yield item

        separator = "=" * 40
        print(separator)
        print(item)

        print(separator)
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with title, author, pub_time and content."""
     byline = response.xpath('//p[@class="authors"]')
     text_nodes = response.xpath(
         '//td[@id="article_content"]//text()').getall()

     yield WxappItem(
         title=response.xpath('//h1[@class="ph"]/text()').get(),
         author=byline.xpath('.//a/text()').get(),
         pub_time=byline.xpath('.//span/text()').get(),
         # Body: all text nodes joined, surrounding whitespace removed.
         content=''.join(text_nodes).strip(),
     )
Пример #23
0
    def parse_detail(self, response):
        """Print a separator and the page URL, then yield a ``WxappItem``.

        ``content`` is the first result of selecting the content cell itself
        (no ``text()`` step), not its joined text nodes.
        """
        print('=' * 30)
        print(response.url)

        byline = response.xpath('//p[@class="authors"]')
        fields = {
            'title': response.xpath('//h1/text()').get(),
            'author': byline.xpath('./a/text()').get(),
            'date': byline.xpath('./span/text()').get(),
            'content': response.xpath('//td[@id="article_content"]').get(),
        }

        yield WxappItem(**fields)
Пример #24
0
 def parse_item(self, response):
     """Return a ``WxappItem`` with title, author, push_time and content."""
     text_nodes = response.xpath(
         "//td[@id='article_content']//text()").getall()
     fields = dict(
         title=response.xpath("//h1[@class='ph']/text()").get(),
         author=response.xpath("//p[@class='authors']/a/text()").get(),
         push_time=response.xpath("//p[@class='authors']/span/text()").get(),
         # Body: all text nodes joined, surrounding whitespace removed.
         content="".join(text_nodes).strip(),
     )
     return WxappItem(**fields)
Пример #25
0
    def parse_detail(self, response):
        """Yield a ``WxappItem`` with title, author, pub_time and content."""
        byline = response.xpath('//div/p[@class="authors"]')
        text_nodes = response.xpath(
            '//td[@id="article_content"]//text()').getall()

        item = WxappItem()
        item['title'] = response.xpath('//div/h1[@class="ph"]/text()').get()
        item['author'] = byline.xpath('.//a/text()').get()
        item['pub_time'] = byline.xpath('.//span[@class="time"]/text()').get()
        # Collapse the text nodes into one string and trim outer whitespace.
        item['content'] = "".join(text_nodes).strip()

        yield item
Пример #26
0
 def parse_detail(self, response):
     """Yield a ``WxappItem``, then print the scraped values.

     The three ``print`` calls sit after the ``yield`` and therefore run once
     the generator is resumed.
     """
     heading = response.xpath("//h1[@class='ph']/text()").get()
     byline = response.xpath("//p[@class='authors']")
     writer = byline.xpath(".//a/text()").get()
     published = byline.xpath(".//span/text()").get()
     body = "".join(
         response.xpath("//td[@id='article_content']//text()").getall()
     ).strip()

     yield WxappItem(title=heading, author=writer, time=published, content=body)

     print('author:%s/pub_time:%s' % (writer, published))
     print(heading)
     print(body)
Пример #27
0
 def parse_item(self, response):
     """Parse the page with BeautifulSoup and yield its title and description.

     Prints the title and a separator line before yielding. A field whose CSS
     selector matches nothing is set to ``None`` instead of raising
     ``AttributeError`` from ``None.text``.
     """
     soup = BeautifulSoup(response.text, 'lxml')

     def text_of(css):
         # select_one() yields None when the selector matches nothing.
         node = soup.select_one(css)
         return node.text if node is not None else None

     title = text_of(
         '#ct > div.mn > div > div.middle_info.cl > div > div.h.hm.cl > div:nth-child(1) > h1'
     )
     describe = text_of(
         '#ct > div.mn > div > div.middle_info.cl > div > div.blockquote > p'
     )
     print(title)
     print('=' * 50)
     yield WxappItem(title=title, describe=describe)
Пример #28
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with title, author, pub_time and content."""
     byline = response.xpath("//p[@class='authors']")
     text_nodes = response.xpath(
         "//td[@id='article_content']//text()").getall()
     # Join the text nodes and drop leading/trailing whitespace.
     body = "".join(text_nodes).strip()

     fields = {
         'title': response.xpath("//h1[@class='ph']/text()").get(),
         'author': byline.xpath(".//a/text()").get(),
         'pub_time': byline.xpath(".//span/text()").get(),
         'content': body,
     }
     yield WxappItem(**fields)
Пример #29
0
 def parse_detail(self, response):
     """Yield a ``WxappItem`` with title, author, pub_time and article_content.

     ``article_content`` is all text under the content cell, joined into one
     string and stripped of surrounding whitespace.
     """
     # The former ``item = {}`` placeholder was overwritten before use and
     # has been removed.
     # title
     title = response.xpath('//h1[@class="ph"]/text()').get()
     # author
     author = response.xpath('//p[@class="authors"]/a/text()').get()
     # publication time
     pub_time = response.xpath('//p[@class="authors"]/span/text()').get()
     # content
     article_content = response.xpath(
         '//td[@id="article_content"]//text()').getall()
     article_content = ''.join(article_content).strip()
     item = WxappItem(title=title,
                      author=author,
                      pub_time=pub_time,
                      article_content=article_content)
     yield item
Пример #30
0
 def parse_detail(self, response):
     """Print the scraped fields between two rule lines, then yield the item."""
     rule = '-' * 80
     print(rule)

     heading = response.xpath("//h1[@class='ph']/text()").get()
     writer = response.xpath("//p[@class='authors']/a/text()").get()
     published = response.xpath("//span[@class='time']/text()").get()
     text_nodes = response.xpath(
         "//td[@id='article_content']//text()").getall()
     # Body: all text nodes joined, surrounding whitespace removed.
     body = ''.join(text_nodes).strip()

     print('title:', heading, 'author:', writer, 'pub_time:', published)
     print('article_content:', body)

     item = WxappItem(
         title=heading,
         author=writer,
         pub_time=published,
         article_content=body,
     )
     print(rule)
     yield item