Example #1
    def parse_news(self, response):
        soup = bs(response.text, "html.parser")
        item = DemoItem()

        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]
        # Fall back to a placeholder when the <time> tag is missing.
        time_tag = soup.find("time", "entry-date updated td-module-date")
        pub_time = time_tag.text.strip() if time_tag else "0000-00-00 00:00:00"
        item["pub_time"] = Util.format_time2(pub_time)
        title_tag = soup.find("h1", "entry-title")
        item["title"] = title_tag.text.strip() if title_tag else None
        div = soup.find("div", "td-post-content tagdiv-type")
        images = [img.get("src") for img in div.find_all("img")] or None
        # Some articles keep their text in <h4> blocks and leave the <p>
        # tags empty; fall back to the headings when the lead <p> is blank.
        paragraphs = div.find_all("p")
        abstract = paragraphs[0].text.strip() if paragraphs else ""
        if abstract:
            body = "\n".join(p.text.strip() for p in paragraphs)
        else:
            headings = div.find_all("h4")
            abstract = headings[0].text.strip() if headings else None
            body = "\n".join(h.text.strip() for h in headings)
        item["images"] = images
        item["abstract"] = abstract
        item["body"] = body
        self.logger.info(item)
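A pattern repeated throughout these examples is calling the same soup.find(...) twice, once to test for None and once for .text. A small helper keeps each field to a single lookup; this is a sketch, and safe_text with its default argument is not part of the original code:

    def safe_text(tag, default=None):
        # Return the stripped text of a BeautifulSoup tag, or a default
        # when find()/select_one() returned None.
        return tag.text.strip() if tag else default

    # equivalent to the title extraction above:
    # item["title"] = safe_text(soup.find("h1", "entry-title"))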
Example #2
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")
        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]

        item["title"] = soup.find(class_="post-title entry-title").text.strip()
        item["pub_time"] = Util.format_time2(
            soup.find(class_="published timeago").text.strip())

        content = soup.find(class_="post-body entry-content")
        images = [img.get("src") for img in content.find_all("img")
                  ] if content.find_all("img") else []
        item["images"] = images
        # Prefer the dir="ltr" blocks; fall back to the whole content text.
        body1 = ''
        for div in content.find_all(dir="ltr"):
            body1 += (div.text.strip() + '\n')
        if body1 == '':
            body1 = content.text

        # Drop blank lines from the body.
        body = ''.join(b + '\n' for b in body1.split('\n') if b)
        item["body"] = body
        item["abstract"] = body.split("\n")[0]
        yield item
Example #3
 def parse_detail(self, response):
     item = DemoItem()
     soup = BeautifulSoup(response.text, features="lxml")
     item['title'] = soup.select_one(
         ".o-article .entry-content h1").text.strip()
     item['pub_time'] = khulasaa_time_switch(
         soup.select_one(".author-disc .date .author span").text)
     images = []
     for img in soup.select(".content-section .featured-box img"):
         images.append(img.get("src"))
     item['images'] = images
     abstract = ""
     for a in soup.select(".post-content ul li h3"):
         abstract += a.text.strip()
         abstract += "\n"
     item['abstract'] = abstract
     body = ""
     for b in soup.select(".post-content p"):
         body += b.text.strip()
     item['body'] = body
     item['category1'] = soup.select_one(
         ".breadcrumb span span span a").text.strip()
     item['category2'] = None
     item['request_url'] = response.request.url
     item['response_url'] = response.url
     item['website_id'] = self.website_id
     item['language_id'] = self.language_id
     item['cole_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                       time.localtime(int(time.time())))
     yield item
Example #4
File: malaya.py Project: gitzdx/crawler
    def parse_item(self, response):
        soup = BeautifulSoup(response.text, 'html.parser')
        item = DemoItem()
        category = response.url.split('/')[-3].split('_')
        if len(category) == 3:
            item['category1'] = category[1]
            item['category2'] = category[2]
        else:
            item['category1'] = category[0]
            item['category2'] = category[1]

        item['title'] = soup.select_one('h1.entry-title').text

        item['pub_time'] = Util.format_time2(
            soup.select('span.td-post-date > time')[0].text)
        item['images'] = [
            i.get('data-src') for i in soup.select('div.td-post-content img')
        ]
        item['abstract'] = soup.select('div.td-post-content > p')[0].text

        ss = ''
        for i in soup.select('div.td-post-content > p'):
            ss += i.text + '\n'  # '\n', not r'\n' (a raw string appends a literal backslash-n)
        item['body'] = ss

        return item
Example #5
    def parse(self, response):
        '''
        :param response:
        :return: links to the first-level category pages
        '''
        soup = bs(response.text, "html.parser")
        for li in soup.select("#menu-main-menu-1 > li.menu-item")[1:-2]:
            a = li.select_one("a")
            category1 = a.text
            category1_url = a.get("href")

            sub_menu = li.find("ul", class_="sub-menu")
            if sub_menu:
                for sub_a in sub_menu.select("a"):
                    # Create a fresh item per request: one shared mutable
                    # item would let later loop iterations overwrite the
                    # categories before the callbacks ever run.
                    item = DemoItem()
                    item['category1'] = category1
                    item['category2'] = sub_a.text
                    yield scrapy.Request(sub_a.get("href"),
                                         callback=self.get_next_page,
                                         meta={"item": item})  # pass data between layers via meta
            else:
                item = DemoItem()
                item['category1'] = category1
                item['category2'] = None
                yield scrapy.Request(category1_url,
                                     callback=self.get_next_page,
                                     meta={"item": item})  # pass data between layers via meta
Example #6
 def parse_news(self, response):
     soup = bs(response.text, "html.parser")  # always name the parser
     item = DemoItem()
     item["pub_time"] = Util.format_time2(
         soup.select('.post-meta > span')[1].text)
     title_tag = soup.find("h1", class_="post-title entry-title")
     item["title"] = title_tag.text.strip() if title_tag else None
     thumb = soup.find("div", class_="single-post-thumb")
     item["images"] = [thumb.find("img").get("src")] if thumb else None
     category1 = soup.select_one(
         "#main-content > div > article > div > p > span:nth-child(3) > a"
     ).text.strip()
     item["category1"] = category1
     item["category2"] = None
     # Guard on the <div class="entry"> existing before touching its children.
     entry = soup.find("div", class_="entry")
     first_p = entry.find("p") if entry else None
     item["abstract"] = first_p.text.strip() if first_p else None
     item["body"] = "\n".join(
         p.text.strip() for p in entry.find_all("p")) if entry else None
     yield item
Example #7
    def parse_details(self, response):
        item = DemoItem()
        soup = BeautifulSoup(response.text, 'lxml')
        item['category1'] = response.meta['category1']
        item['category2'] = response.meta['category2']

        title_tag = soup.find('h1', class_='post-title entry-title')
        item['title'] = title_tag.text.strip() if title_tag else None

        item['body'] = ''  # must be initialized before appending
        item['abstract'] = ''
        # This selector pulls two different tag types in one pass.
        body_list = soup.select('.entry-content p, .entry-content h3')
        for body in body_list:
            item['body'] += body.text.strip() + '\n'
        if body_list:
            item['abstract'] = body_list[0].text.strip()

        item['images'] = []
        # select() already returns [] when nothing matches, so no extra
        # existence check is needed.
        for image in soup.select('.entry-content p>img, .single-featured-image>img'):
            item['images'].append(image.get('src'))

        pub_tag = soup.find('span', class_='date meta-item tie-icon')
        if pub_tag:
            item['pub_time'] = Util.format_time2(pub_tag.text.strip())

        yield item
Example #8
 def parse_item(self, response):
     soup = BeautifulSoup(response.text, 'html.parser')
     item = DemoItem()
     item['title'] = response.meta['title']
     item['category1'] = response.meta['category1']
     item['abstract'] = response.meta['abstract']
     item['images'] = response.meta['images']
     item['category2'] = response.meta['category2']
     if re.findall('headline', response.url):  # regular news
         ss = ''
         for i in soup.select('.dit > p > b'):
             ss += i.text + '\n'
         try:
             ss += soup.select_one('.dit > p > span').text
         except AttributeError:  # no trailing <span>
             pass
         item['body'] = ss
         # e.g. ['Wednesday', '6', 'January', '2021', '02:12:12', 'PM']
         tt = soup.select_one('.colort').text.split()
         # reassemble as 'January 6 2021 02:12:12 PM'
         tt = ' '.join([tt[2], tt[1], tt[3], tt[4], tt[5]])
         item['pub_time'] = Util.format_time2(tt)
     elif re.findall('watchvid', response.url):  # video news
         item['body'] = soup.select_one('.dit > p').text
         item['pub_time'] = soup.select_one('.colort').text
     else:  # photo news
         item['body'] = soup.select_one('.news_saa > p').text
         item['pub_time'] = Util.format_time(0)
     return item
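The token shuffling above can also be handed to datetime.strptime in one step; a sketch, assuming the '.colort' text always follows this one layout:

    from datetime import datetime

    raw = 'Wednesday 6 January 2021 02:12:12 PM'
    # weekday, day, month, year, time, meridiem
    pub = datetime.strptime(raw, '%A %d %B %Y %I:%M:%S %p')
    print(pub.strftime('%Y-%m-%d %H:%M:%S'))  # 2021-01-06 14:12:12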
Example #9
File: pna.py Project: youuuoy/crawler
    def parse_item(self, response):
        soup = bs(response.text, 'html.parser')
        item = DemoItem()
        item['category1'] = response.meta['category1']
        item['category2'] = None
        # The second-level category used to be faked from the number in the
        # URL; solved by passing it through the meta parameter of Request().

        item['title'] = soup.select('div.page-header h1')[0].text
        ts = soup.select('span.date ')[0].text  # e.g. ts = 'Published October 22, 2020, 4:32 PM'; formatted below
        month = Util.month2[ts.split(',')[0].split(' ')[1]]
        date = ts.split(',')[1] + '-' + month + '-' + ts.split(',')[0].split(' ')[2]
        date = date.strip()  # strip() returns a new string; reassign it
        ttt = ts.split(',')[-1].split(' ')  # ttt = ['', '4:32', 'PM']
        # '% 12' also handles the 12 AM / 12 PM edge cases
        shi = int(ttt[-2].split(':')[0]) % 12
        if ttt[-1] == 'PM':
            shi += 12
        time = str(shi) + ":" + ttt[-2].split(':')[1] + ":" + '00'
        datetime = date + ' ' + time
        item['pub_time'] = datetime
        item['images'] = [i.get('src') for i in soup.select('div.page-content  img')]

        item['abstract'] = soup.select('div.page-content > p')[0].text

        ss = ''
        for i in soup.select('div.page-content > p'):
            ss += i.text + '\n'  # '\n', not r'\n'
        item['body'] = ss
        yield item
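The slicing and 12-hour arithmetic can likewise collapse into a single strptime call; a sketch, assuming the 'Published ' prefix is always present:

    from datetime import datetime

    ts = 'Published October 22, 2020, 4:32 PM'
    pub = datetime.strptime(ts, 'Published %B %d, %Y, %I:%M %p')
    print(pub.strftime('%Y-%m-%d %H:%M:%S'))  # 2020-10-22 16:32:00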
Example #10
    def parse(self, response):
        for tr in response.xpath('//table[@class="Tab"]//tr[position()>1]'):
            itemRow = {
                # guard the cell so .strip() never runs on None
                'demo': (tr.xpath('td[1000]/a/@id').get() or '').strip(),
                'ss': tr.xpath('td[2]/a/@id').extract_first(default='').strip(),
                'id': tr.xpath('td[2]/a/@id').extract_first(),
                'registryNo': tr.xpath('td[2]/a/text()').re_first(r'\w+'),
                # .get() so the dict holds strings, not Selector objects
                'status': tr.xpath('td[3]/a/text()').get(),
                'drugName': tr.xpath('td[4]/text()').get(),
                'shutZ': tr.xpath('td[5]/text()').get(),
                'title': tr.xpath('td[6]/text()').get(),
            }
            print(itemRow)
            # yield each row; a bare return here would stop after the first
            yield itemRow

        # optional pagination, kept from the original comments:
        # next_page = response.css('li.next a::attr(href)').get()
        # if next_page is not None:
        #     yield response.follow(next_page, callback=self.parse)
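The original also started an ItemLoader with empty selectors; a minimal working use would look like this sketch (the 'title' field and its XPath are illustrative, not taken from DemoItem):

    from scrapy.loader import ItemLoader

    l = ItemLoader(item=DemoItem(), response=response)
    # add_xpath/add_css need real selectors; empty strings are invalid
    l.add_xpath('title', '//table[@class="Tab"]//tr[2]/td[6]/text()')
    yield l.load_item()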
Example #11
 def parse(self, response):   # the news list pages carry the full articles
     soup = BeautifulSoup(response.text, 'html.parser')
     flag = True
     time_sel = ('#page-content-wrapper > div.container.section-1 > div > '
                 'div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div > div h5')
     last_pub_time = Util.format_time2(soup.select(time_sel)[-1].text)
     if self.time is None or Util.format_time3(last_pub_time) >= int(self.time):
         all_pub_time = [Util.format_time2(i.text) for i in soup.select(time_sel)]
         all_title = [i.text.strip() for i in soup.select('.lk-tle')]
         all_images = ['https://www.myanmarisis.org' + i.get('src')
                       for i in soup.select('.img-responsive.lk-img')]
         all_body = [i.text.strip() for i in soup.select(
             '#page-content-wrapper > div.container.section-1 > div > '
             'div.col-lg-9.col-md-9.col-sm-9.col-xs-12 > div p')]
         # zip() stops at the shortest list, so pages with fewer than nine
         # entries cannot raise IndexError.
         for pub_time, title, images, body in zip(all_pub_time, all_title,
                                                  all_images, all_body):
             item = DemoItem()
             item['pub_time'] = pub_time
             item['images'] = [images]
             item['title'] = title
             item['body'] = body
             item['category1'] = 'event'
             item['category2'] = None
             item['abstract'] = body.split('\n')[0]
             yield item
     else:
         self.logger.info('Time cutoff reached!')
         flag = False
     if flag:
         try:
             nextPage = soup.select_one('.active ~ li a').get('href')
             yield Request(url=nextPage)
         except AttributeError:  # no next-page link
             self.logger.info("No next page.")
Example #12
 def parse_detail(self, response):
     item = DemoItem()
     soup = BeautifulSoup(response.text, 'html.parser')
     # Guard on the tag itself; .text on a missing tag raises before any
     # "if ... .text" check can run.
     time_tag = soup.select_one('abbr.published')
     temp_time = time_tag.text if time_tag else None
     adjusted_time = time_adjustment(temp_time)
     item['pub_time'] = adjusted_time
     image_list = []
     imgs = soup.select('.post-article img')
     for img in imgs:
         if img.get('src'):
             image_list.append(img.get('src'))
     item['images'] = image_list
     item['abstract'] = response.meta['abstract']
     body_tag = soup.find('div', class_="post-body entry-content")
     item['body'] = body_tag.text if body_tag else None
     label_div = soup.find('div', class_="label-head Label")
     news_categories = label_div.select('a') if label_div else []
     item['category1'] = news_categories[0].text if news_categories else None
     if len(news_categories) >= 2:
         item['category2'] = news_categories[1].text
     title_tag = soup.find('h1', class_="post-title entry-title")
     item['title'] = title_tag.text if title_tag else None
     yield item
Example #13
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")  # always name the parser

        item["category1"] = soup.select_one("#m-bread2 > a").text
        item["category2"] = None
        title = soup.select_one("#landing-headline > h1").text
        item["title"] = title
        pub_time = soup.select("#m-pd2 > span")[-1].text
        item["pub_time"] = Util.format_time2(pub_time)
        # find_all already returns [] when nothing matches.
        images = [img.find("img").get("src")
                  for img in soup.find_all(class_="wp-caption aligncenter")]
        item["images"] = images
        content = soup.find(id="article-content")
        first_p = content.find("p") if content else None
        item["abstract"] = first_p.text.strip() if first_p else None
        body = ''
        if content:
            for p in content.find_all("p"):
                body += (p.text.strip() + '\n')
        item["body"] = body

        yield item
Example #14
 def parse_detail(self, response):
     item = DemoItem()
     soup = BeautifulSoup(response.text, features='lxml')
     # Guard on the tags themselves; .text on a missing tag raises.
     date_tag = soup.select_one('span.post_info_date')
     item['pub_time'] = time_adjustment(date_tag.text.strip() if date_tag else None)
     image_list = []
     img_div = soup.find('div', class_="post_img static")
     imgs = img_div.select('img') if img_div else []
     if imgs:
         for img in imgs:
             src = img.get('src')
             # skip inline data:image/gif placeholders
             if src and re.findall(r'data:image/gif', src) == []:
                 image_list.append(src)
         item['images'] = image_list
     header = soup.find('div', class_="post_header single")
     all_p = header.select('p') if header else []
     if all_p:
         p_list = [paragraph.text for paragraph in all_p]
         item['abstract'] = p_list[0]
         item['body'] = '\n'.join(p_list)
     else:
         h1 = header.select_one('h1') if header else None
         item['abstract'] = h1.text if h1 else None
         h2s = header.select('h2') if header else []
         item['body'] = h2s[-1].text if h2s else None
     crumb = soup.select_one('div.breadcrumb')
     links = crumb.select('a') if crumb else []
     item['category1'] = links[-1].text if links else None
     title_tag = soup.select_one('div.post_header_title h1')
     item['title'] = title_tag.text if title_tag else None
     yield item
Example #15
    def parse_details(self, response):
        item = DemoItem()
        soup = BeautifulSoup(response.text, 'lxml')
        item['category1'] = response.meta['category1']
        item['category2'] = response.meta['category2']

        title_tag = soup.find('h1', class_='entry-title')
        item['title'] = title_tag.text.strip() if title_tag else None

        item['body'] = ''  # must be initialized before appending
        entry = soup.find('div', class_='entry clearfix')
        body_list = entry.select('p') if entry else []
        for body in body_list:
            item['body'] += body.text.strip() + '\n'
        item['abstract'] = body_list[0].text.strip() if body_list else None

        item['images'] = []
        image_list = entry.select('p>img') if entry else []
        for image in image_list:
            item['images'].append(image.get('src'))

        pub_tag = soup.find('span', class_='updated')
        if pub_tag:
            item['pub_time'] = Util.format_time2(pub_tag.text.strip())

        yield item
Example #16
File: spot.py Project: youuuoy/crawler
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")

        crumbs = soup.find("div", "breadcrumbs header5")
        category1 = crumbs.find("a").text.strip() if crumbs else None
        item["category1"] = category1
        category2 = crumbs.find_all("a")[-1].text.strip() if crumbs else None
        if category2 == category1:
            category2 = None
        item["category2"] = category2
        item["pub_time"] = response.meta["pub_time"]
        item["title"] = response.meta["title"]
        # Guard on the content <section> before calling find_all on it.
        content = soup.find("section", "article-content data-artcl-cnt")
        images = ([img.get("src") for img in content.find_all("img")] or None) if content else None
        item["images"] = images
        blurb = soup.find("p", "blurb mb-2 header6")
        if blurb:
            abstract = blurb.text.strip()
        else:
            abstract = content.find("p").text.strip() if content else None
        item["abstract"] = abstract
        if content:
            body = ''
            for p in content.find_all("p"):
                body += p.text.strip() + '\n'
        else:
            body = None
        item["body"] = body
        self.logger.info(item)
        self.logger.info('\n')
Example #17
    def parse_top(self, response):
        rows = response.xpath(
            '//div[@id="react-app"]/div[1]/div/div/div[2]/div[1]/div/div'
            '/div[2]/div/div[2]/div[3]/div[1]/div[2]/article'
            '//div[@class="srp-list-item-description column"]/a')
        for row in rows:
            Model = row.xpath(
                './/span[@class="srp-list-item-basic-info-model"]/text()'
            ).extract_first()
            Price = row.xpath(
                './/div[@class="price-flag"]/span/text()').extract_first()
            Mileage = row.xpath(
                './/span[@class="srp-list-item-basic-info-mileage"]/text()'
            ).extract_first()
            featuresList = row.xpath(
                './/span[@class="srp-list-item-special-features-value"]/text()'
            ).extract()
            # assumes each listing exposes exactly three feature values
            [Body, Color, Engine] = featuresList

            item = DemoItem()
            item['Model'] = Model
            item['Price'] = Price
            item['Mileage'] = Mileage
            item['Body'] = Body
            item['Color'] = Color
            item['Engine'] = Engine
            yield item
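The three-way unpacking assumes every listing exposes exactly three feature values and raises ValueError otherwise; a tolerant variant (a sketch, not from the original):

    # pad or truncate to exactly three entries before unpacking
    Body, Color, Engine = (featuresList + [None] * 3)[:3]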
Example #18
File: abs-cbn.py Project: ldqsss/crawler
 def parse3(self, response):
     html = BeautifulSoup(response.text, 'html.parser')  # always name the parser
     item = DemoItem()
     parts = response.url.split('/')  # avoid shadowing the list builtin
     item['title'] = html.select('.news-title')[0].text
     item['category1'] = parts[3]
     if re.findall(r'\d+', parts[4]) == []:
         item['category2'] = parts[4]
     item['body'] = ''
     for i in html.select('.article-content > p'):
         item['body'] += (i.text + '\n')
     if html.select('.article-content > p') != []:
         item['abstract'] = html.select('.article-content > p')[0].text
     if html.select('.timestamp-entry > .date-posted') != []:
         # log the raw timestamp only when it actually exists
         self.logger.info(
             html.select('.timestamp-entry > .date-posted')[0].text)
         item['pub_time'] = Util.format_time2(
             html.select('.timestamp-entry > .date-posted')[0].text)
     else:
         item['pub_time'] = Util.format_time()
     if html.select('.article-content > .embed-wrap img') != []:
         item['images'] = [
             html.select('.article-content > .embed-wrap img')
             [0].attrs['src'],
         ]
     yield item
Example #19
File: piagov.py Project: Doglen/crawler
 def parse_detail(self, response):
     item = DemoItem()
     html = BeautifulSoup(response.text, 'html.parser')
     item['category1'] = response.meta['category1']
     item['category2'] = response.meta['category2']
     if html.select_one("div.container h1") is not None:
         item['title'] = html.select_one("div.container h1").text
     item['body'] = ''
     if html.select("div.col-24 p"):
         bodies = html.select("div.col-24 p")
         b_list = [b.text for b in bodies]
         item['body'] = '\n'.join(b_list)
         item['abstract'] = bodies[0].text
     item['images'] = []
     if html.select("div.col-24 figure img"):
         images = html.select("div.col-24 figure img")
         for i in images:
             item['images'].append(i['src'])
     if html.select_one("p.byline span.date") is not None:
         ex = 'Published on (.*)'
         pub_time = html.select_one("p.byline span.date").text
         pub_time = re.findall(ex, pub_time)
         if pub_time:
             pub_time = pub_time[0]
             pub_time = Util.format_time2(pub_time)
             item['pub_time'] = pub_time
         else:
             item['pub_time'] = Util.format_time()
     else:
         item['pub_time'] = Util.format_time()
     yield item
Example #20
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")
        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]
        date_span = soup.find("span", "entry-meta-date updated")
        pub_time = date_span.find("a").text.strip() if date_span else "0000-00-00 00:00:00"
        item["pub_time"] = Util.format_time2(pub_time)

        div = soup.find("div", class_="entry-content clearfix")
        images = [img.get("src") for img in div.find_all("img")] or None
        item["images"] = images
        item["title"] = soup.find("h1", class_="entry-title").text.strip()
        # Build the abstract from the <li> entries when present,
        # otherwise from the first paragraph.
        bullets = div.find_all("li")
        if bullets:
            abstract = ''.join(a.text.strip() for a in bullets)
        else:
            abstract = div.find("p").text.strip()
        item["abstract"] = abstract
        body = "\n".join(p.text.strip() for p in div.find_all("p")) or None
        item["body"] = body
        yield item
Example #21
 def parse(self, response):
     for el in response.xpath('//ul/li'):
         i = DemoItem()
         i['title'] = el.xpath('a/text()').extract()
         i['link'] = el.xpath('a/@href').extract()
         i['desc'] = el.xpath('text()').extract()
         yield i
Example #22
    def parse_news(self, response):
        item = DemoItem()
        soup = bs(response.text, "html.parser")

        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]

        title = soup.find(class_="tdb-title-text").text.strip()
        item["title"] = title
        pub_time = soup.find(
            class_="entry-date updated td-module-date").text.strip()
        item["pub_time"] = Util.format_time2(pub_time)
        # Start from the featured image; keep a real list so the appends
        # below cannot hit None.
        inner = soup.find("div", "tdb-block-inner td-fix-index")
        images = [inner.find("img").get("src")] if inner else []
        for img in soup.find_all("div", "wp-block-image"):
            images.append(img.find("img").get("src"))
        item["images"] = images or None
        abstract = soup.select_one(
            "div.wpb_wrapper > div > div > p").text.strip() if soup.select_one(
                "div.wpb_wrapper > div > div > p") else None
        item["abstract"] = abstract
        body = soup.find(
            class_="tdb-caption-text").text.strip() + '\n' if soup.find(
                class_="tdb-caption-text") else ''
        for p in soup.select("div.wpb_wrapper > div > div > p"):
            body += (p.text.strip() + '\n')
        item["body"] = body

        self.logger.info(item)
        self.logger.info('\n')

        yield item
Example #23
    def parse_2(self, response, **kwargs):
        item = DemoItem()
        new_soup = BeautifulSoup(response.text, 'lxml')
        item['title'] = new_soup.find('div', class_='topHeading',
                                      id='12').h1.string

        bodys = new_soup.select('article .articleBody > p')
        # article body
        all_body = ''
        for body_1 in bodys:
            all_body += body_1.text
        item['body'] = all_body
        item['pub_time'] = time_font(
            new_soup.select('.articleHd .dateInfo .fl')[0].text).strip("\t")
        # image URLs: append each matched image's own data-src
        image_list = []
        for image in new_soup.find_all('img', id='jagran_image_id'):
            image_list.append(image.get('data-src'))
        item['images'] = image_list
        item['category1'] = new_soup.select(
            'aside.breadcrum li.first > a > span')[0].string
        item['category2'] = new_soup.select(
            'aside.breadcrum li:nth-of-type(3) > a > span')[0].text
        item['abstract'] = new_soup.select(
            'aside.breadcrum li:nth-of-type(4) > span')[0].text
        yield item
Example #24
 def parse_detail(self, response):
     item = DemoItem()
     soup = BeautifulSoup(response.text, features='lxml')
     # Prefer the date next to the author; fall back to the misc-info cell.
     date_span = soup.select_one('div.date_and_author_container span')
     if date_span and len(date_span.text.split(" ")) > 1:
         temp_time = date_span.text.split(" ")[1]
     else:
         temp_time = soup.select_one('td.miscinfo').text.split(" ")[1]
     item['pub_time'] = time_adjustment(temp_time)
     image_list = []
     # select() already returns [] when nothing matches.
     imgs = soup.select('div[align="center"] img')
     if imgs:
         for img in imgs:
             image_list.append(img.get('src'))
         item['images'] = image_list
     p_list = []
     if soup.select('div.newscontent p'):
         all_p = soup.select('div.newscontent p')
     else:
         all_p = soup.select('div[align="justify"]')
     for paragraph in all_p:
         p_list.append(paragraph.text)
     item['abstract'] = p_list[0] if p_list else None
     item['body'] = '\n'.join(p_list)
     item['category1'] = response.meta['category']
     title_tag = soup.select_one('div.heading_container')
     item['title'] = title_tag.text if title_tag else None
     yield item
Example #25
File: tv9hindi.py Project: gitzdx/crawler
 def parse_detail(self, response):
     item = DemoItem()
     soup = BeautifulSoup(response.text, features="lxml")
     item['title'] = soup.select_one(".detailBody").find("div", class_="LeftCont content").find(
         "h1").text.strip() if soup.select_one(".detailBody") else None
     images = []
     image = soup.select_one(".ArticleBodyCont .articleImg").find_all("img") if soup.select_one(
         ".ArticleBodyCont .articleImg") else []
     for img in image:
         images.append(img.get("data-src"))
     item['images'] = images
     left = soup.find("div", class_="LeftCont content")
     pub_time = left.find("ul", class_="AuthorInfo").find_all("li")[-1].text.strip() if left else None
     item['pub_time'] = tv9hindi_time_switch2(pub_time)
     item['abstract'] = left.find_all("p")[1].text.strip() if left else None
     body_content = soup.find("div", class_="ArticleBodyCont").find_all("p") if soup.find(
         "div", class_="ArticleBodyCont") else []
     body = ""
     mx = '<p><span style="color: #0000ff;">'  # filter out colored lead-in paragraphs
     for p in body_content:
         if re.match(mx, str(p)) is None:
             body += p.text.strip()
             body += "\n"
     breadcrumb = soup.find("div", class_="breadcrum")
     category = breadcrumb.select_one("#breadcrumbs").find_all("a")[-2:] if breadcrumb else []
     item['category1'] = category[0].text.strip() if category else None
     item['category2'] = category[1].text.strip() if len(category) > 1 else None
     item['body'] = body
     item['request_url'] = response.request.url
     item['response_url'] = response.url
     item['website_id'] = self.website_id
     item['language_id'] = self.language_id
     item['cole_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(time.time())))
     yield item
Example #26
 def parse(self, response):
     html = BeautifulSoup(response.text, 'html.parser')  # always name the parser
     if response.url == 'https://www.sunstar.com.ph/Philippines':
         for i in html.select('.tablecenter > a')[0:8]:
             yield Request(i.attrs['href'])
     elif re.findall(
             r'https://www.sunstar.com.ph/article/\d+/\S+?/\S+?/\S+?',
             response.url) != []:
         item = DemoItem()
         parts = response.url.split('/')  # avoid shadowing the list builtin
         item['title'] = html.select('.titleArticle > h1')[0].text
         item['category1'] = parts[5]
         if re.findall(r'\d+', parts[6]) == []:
             item['category2'] = parts[6]
         item['body'] = html.select('.col-sm-11 p')[0].text
         item['abstract'] = html.select('.col-sm-11 p')[0].text
         item['pub_time'] = Util.format_time2(
             html.select('.articleDate')[0].text)
         if html.select('.imgArticle > img') != []:
             item['images'] = [
                 html.select('.imgArticle > img')[0].attrs['src'],
             ]
         yield item
     else:
         for i in html.select('.sectionTopWidget > div > div .ratio'):
             yield Request(i.attrs['href'])
         for i in html.select(
                 '.moreSectionWidget > div > div a[class="title-C20 title blu-hover"]'
         ):
             yield Request(i.attrs['href'])
Example #27
    def parse_detail(self, response):
        item = DemoItem()
        soup = BeautifulSoup(response.text, features='lxml')
        # Guard on the tags themselves; .text on a missing tag raises.
        time_tag = soup.find('span', class_="thetime date updated")
        temp_time = time_tag.text if time_tag else None
        adjusted_time = time_adjustment(temp_time)
        if self.time is None or Util.format_time3(adjusted_time) >= int(
                self.time):
            content = soup.select_one('div.thecontent')
            item['body'] = content.text.replace('\nAdvertisements\n',
                                                '') if content else None
            item['pub_time'] = adjusted_time
            first_p = soup.select_one('div.thecontent p')
            item['abstract'] = first_p.text if first_p else None
            cat_tag = soup.select_one('span.thecategory')
            item['category1'] = cat_tag.text if cat_tag else None
            title_tag = soup.select_one('header h1')
            item['title'] = title_tag.text if title_tag else None
            yield item
        else:
            self.logger.info("Time cutoff reached")
Example #28
 def parse_2(self, response, **kwargs):
     page_soup = BeautifulSoup(response.text, 'lxml')
     category1 = page_soup.select(
         'div.main-title-outer.pull-left div.main-title')[0].text.strip()
     item = DemoItem()
     item['category1'] = category1
     item['category2'] = category1
     for i in page_soup.select(
             'div.col-md-4.col-sm-8.col-xs-16 div.topic.nt_topic a'):
         yield Request(i.attrs['href'],
                       callback=self.parse_3,
                       meta={'item': item})
     if page_soup.select('div.ntdv_pagination li')[-1].find(
             'a').attrs['href']:
         next_page = response.url + page_soup.select(
             'div.ntdv_pagination li')[-1].find('a').attrs['href']
         # Peek at the last article's date on the next page. Note that
         # requests.get() is a blocking call inside a Scrapy spider.
         last_news_url = BeautifulSoup(
             requests.get(next_page).text, 'lxml').select(
                 'div.col-md-4.col-sm-8.col-xs-16 div.topic.nt_topic a'
             )[-1].attrs['href']
         last_time = time_font(
             BeautifulSoup(requests.get(last_news_url).text,
                           'lxml').select('div.time')[0].text)
         if self.time is None or Util.format_time3(last_time) >= int(
                 self.time):  # cutoff check
             # next page
             yield Request(next_page, callback=self.parse_2)
         else:
             self.logger.info('Time cutoff reached')
Example #29
File: fili.py Project: Doglen/crawler
    def parse_item(self, response):
        soup = BeautifulSoup(response.text, 'html.parser')
        item = DemoItem()
        category = soup.select('div.breadcrumbs > a')
        if len(category) == 1:
            item['category1'] = category[0].text
            item['category2'] = None
        else:
            item['category1'] = category[0].text
            item['category2'] = category[1].text

        item['title'] = soup.select('div.breadcrumbs > span')[-1].text
        ttt = soup.select('dd.published')[0].text.split(',')[1].split(' ')[1:]
        datetime = ttt[2] + '-' + str(
            Util.month[ttt[1]]) + '-' + ttt[0] + ' ' + ttt[-1][:5] + ':00'
        item['pub_time'] = datetime
        item['images'] = None
        item['abstract'] = soup.select('div.item-page > p')[0].text

        ss = ''
        for i in soup.select('div.item-page > p'):
            ss += i.text + '\n'  # '\n', not r'\n' (a raw string appends a literal backslash-n)
        item['body'] = ss

        yield item
Example #30
 def parse2(self, response):
     item = DemoItem()
     html = BeautifulSoup(response.text, 'html.parser')  # always name the parser
     parts = response.url.split('/')  # avoid shadowing the list builtin
     item['title'] = html.select('.title')[0].text
     item['category1'] = parts[3]
     if re.findall(r'\d+', parts[4]) == []:
         item['category2'] = parts[4]
     item['body'] = ''
     flag = False
     # note: this content-body id looks specific to a single article page
     for i in html.select('#content-body-244757-498257 > p'):
         item['body'] += (i.text + '\n')
         if i.text != '' and flag == False:
             flag = True
             item['abstract'] = i.text
     if html.select('.dateLine > p') != []:
         item['pub_time'] = Util.format_time2(
             html.select('.dateLine > p')[0].text)
     elif html.select('.dateString') != []:
         item['pub_time'] = Util.format_time2(
             html.select('.dateString')[0].text)
     if html.select('.margin-bottom-15 img') != []:
         item['images'] = [
             'https://www.cnnphilippines.com' +
             html.select('.margin-bottom-15 img')[0].attrs['src'],
         ]
     yield item
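The flag-based scan for the first non-empty paragraph can also be expressed with next(); a sketch, assuming the same selector as above:

    paragraphs = [i.text for i in html.select('#content-body-244757-498257 > p')]
    item['body'] = ''.join(t + '\n' for t in paragraphs)
    # first truthy paragraph, or None when all are empty
    item['abstract'] = next((t for t in paragraphs if t), None)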