示例#1
0
文件: WHO.py 项目: ldqsss/crawler
 def _4matTime(self, time):
     """Reorder a whitespace-separated date from 'A B C' to 'B A C' and
     normalise it through Util.format_time2."""
     parts = time.strip().split()
     reordered = " ".join((parts[1], parts[0], parts[2]))
     return Util.format_time2(reordered)
示例#2
0
from selenium import webdriver
import unittest
from ddt import data, ddt
import time
from demo.util import Util

# Excel-driven test data; each row feeds one @data case below.
testdata = Util.read_excel("D:/python-webUI-auto/data/data.xlsx", "Sheet1")


@ddt
class Search_by_ddt(unittest.TestCase):
    """Data-driven Baidu search test; one case per row of the Excel data."""

    def setUp(self):
        """Launch Chrome, open Baidu, and arm a 5s implicit wait."""
        driver = webdriver.Chrome("../tools/chromedriver.exe")
        driver.maximize_window()
        driver.get("https://www.baidu.com")
        driver.implicitly_wait(5)
        self.driver = driver

    @data(*testdata)
    def test_search_by_ddt(self, data):
        """Type the row's search content into the box and submit it."""
        query = data["content"]
        print("搜索内容->:%s" % query)
        search_box = self.driver.find_element_by_id('kw')
        # Type the query, pause briefly, then submit the search form.
        search_box.send_keys(query)
        time.sleep(3)
        search_box.submit()

    def tearDown(self):
        """Close the browser after each test."""
        self.driver.quit()
示例#3
0
    def parse_news(self, response):
        """Parse one news page into a DemoItem: pub_time, title, abstract,
        body and image URLs."""
        # Month name -> zero-padded month number (replaces a 12-branch elif chain).
        months = {
            "January": "01", "February": "02", "March": "03", "April": "04",
            "May": "05", "June": "06", "July": "07", "August": "08",
            "September": "09", "October": "10", "November": "11", "December": "12",
        }
        soup = BeautifulSoup(response.text, "html.parser")
        # Publication time from the header's <small>, e.g. "12 January, 2021".
        small = soup.select_one("h2.page-header>small")
        pub_time_list = re.split(" |,", small.text) if small else None
        time2 = Util.format_time()  # fall back to "now" when no date is shown
        if pub_time_list:
            month = months.get(pub_time_list[-4])
            if month:
                time2 = pub_time_list[-1] + "-" + month + "-" + pub_time_list[
                    -3] + " 00:00:00"
        pub_time = time2
        # Title: header text with the <small> date removed.
        temp = soup.select_one("h2.page-header")
        for s in temp('small'):
            s.extract()
        title = temp.text.strip()
        # Body: non-empty lines of the article text, one paragraph per line.
        body_list2 = [
            b for b in re.split("\r\n|\n",
                                soup.select_one("div.col-md-12>p").text.strip())
            if b
        ]
        body = "\n".join(body_list2)
        # Abstract: first paragraph of the body.
        abstract = body_list2[0]
        # Images: absolute URLs for every centred <img>.
        images = [
            "http://www.tourism.gov.ph" + t.get("src")
            for t in soup.select("center>img")
        ]

        item = DemoItem()
        item["category1"] = "News Updates"
        item["category2"] = "Featured News"
        item["pub_time"] = pub_time
        item["title"] = title
        item["abstract"] = abstract
        item["body"] = body
        item["images"] = images

        yield item
示例#4
0
def nayalook_time_switch1_2(time_string):
    """Convert a relative time like '3 days ago' to a unix timestamp."""
    formatted = Util.format_time2(time_string)
    return Util.format_time3(str(formatted))
示例#5
0
def nayalook_time_switch2(time_string):
    """Convert a relative time string to '%Y-%m-%d %H:%M:%S' via Util.format_time2."""
    # Returns %Y-%m-%d %H:%M:%S
    # Input example: "3 days ago"
    return Util.format_time2(time_string)
示例#6
0
    def parse(self, response, **kwargs):
        """Scrape the macaupostdaily front page, then page through the AJAX
        news_list endpoint, yielding a Request per article not older than
        self.time (None means no cutoff)."""
        header = {
            'Accept': 'application/json,text/javascript, */*; q=0.01',
            'Accept-Encoding': 'gzip, deflate, br',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Content-Length': '11',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Cookie': 'PHPSESSID=h2q86fctchauhq3ngeg8cu2ld7',
            'Host': 'www.macaupostdaily.com',
            'Origin': 'https://www.macaupostdaily.com',
            'Referer': 'https://www.macaupostdaily.com/',
            'sec-ch-ua':
            'Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
            'sec-ch-ua-mobile': '?0',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'X-Requested-With': 'XMLHttpRequest'
        }
        url = 'https://www.macaupostdaily.com/'
        url_list = []
        time_list = []
        title_list = []
        img_list = []

        # Page 1 comes from the rendered HTML list.
        news_soup = BeautifulSoup(response.text, 'lxml')
        for li in news_soup.find('ul', class_='new_list',
                                 id='fu').find_all('li'):
            url_list.append('https://www.macaupostdaily.com' +
                            li.find('a').get('href'))
            time_list.append(
                li.find('div', class_='time').text.strip('\n').strip(' ') +
                ":00")
            title_list.append(li.find('strong').text.strip('\n'))
            img_list.append(url + li.find('img').get('src'))

        request_url = 'https://www.macaupostdaily.com/index.php/Article/news_list'

        # Subsequent pages come from the POST endpoint, starting at page 2.
        page_no = 2
        while True:
            payload = {'cid': '', 'page': "%d" % page_no}

            rep = requests.post(url=request_url, data=payload,
                                headers=header).json()
            articles = rep['list']  # avoid shadowing the builtin `list`
            for entry in articles:
                url_list.append("https://www.macaupostdaily.com/article" +
                                entry['id'] + ".html")
                title_list.append(entry['title'])
                time_list.append(entry['time'] + ":00")
                img_list.append('https://www.macaupostdaily.com' + entry['img'])
            for idx in range(0, len(url_list)):
                if self.time is None or Util.format_time3(
                        time_list[idx]) >= int(self.time):
                    yield Request(url_list[idx],
                                  callback=self.parse_2,
                                  meta={
                                      'time': time_list[idx],
                                      'title': title_list[idx],
                                      'img': img_list[idx]
                                  })
            # BUG FIX: the original called int(self.time) unconditionally here,
            # raising TypeError when self.time is None; it also never stopped
            # when the endpoint returned no more articles.
            if not articles:
                break
            if self.time is not None and Util.format_time3(
                    time_list[-1]) < int(self.time):
                break
            url_list = []
            time_list = []
            img_list = []
            title_list = []
            page_no += 1
示例#7
0
def nayalook_time_switch1(time_string):
    """Convert a 'dd/mm/YYYY' date (e.g. '30/11/2020') to a unix timestamp."""
    parsed = datetime.strptime(time_string, "%d/%m/%Y")
    return Util.format_time3(str(parsed))
示例#8
0
def nhandan_time_switch1(time_string):
    """Convert a date like '2020年12月25日 星期五' to a unix timestamp."""
    # Drop the trailing weekday token before parsing.
    date_part = time_string.rsplit(" ", 1)[0]
    parsed = datetime.strptime(date_part, "%Y年%m月%d日")
    return Util.format_time3(str(parsed))
示例#9
0
 def parse_2(self, response, **kawargs):
     """Parse a category listing page: yield a Request per article, then
     follow pagination while the oldest visible article is still newer
     than self.time (None means no cutoff)."""

     def _last_pub_time(page):
         """Publication time of the oldest article on the page, or None.

         The site renders listings in three layouts; each stores the date
         in a different element.
         """
         if page.find('div', class_='year-month') is not None:
             raw = page.find('div', class_='year-month').find('em').text.strip('-').strip(' ') + ' ' + \
                   page.find('div', class_='mag-box-container clearfix').find_all('div', class_='day-month')[-1].text
             return time_font_2(raw)
         if page.find('div',
                      class_='masonry-grid-wrapper masonry-with-spaces') is not None:
             return time_font(
                 page.find(
                     'div',
                     class_='masonry-grid-wrapper masonry-with-spaces').
                 find_all('span', class_='date meta-item tie-icon')[-1].text)
         if page.find('ul', id='posts-container',
                      class_='posts-items') is not None:
             return time_font(
                 page.find('ul', id='posts-container',
                           class_='posts-items').find_all(
                               'span',
                               class_='date meta-item tie-icon')[-1].text)
         return None

     page = BeautifulSoup(response.text, 'lxml')
     category1 = page.find('h1', class_='page-title').text
     # Article links: grid layout vs masonry layout.
     if page.find('ul', id='posts-container', class_='posts-items') is not None:
         for i in page.find('ul',
                            id='posts-container',
                            class_='posts-items').find_all(
                                'a', class_='post-thumb'):
             images = i.find('img').get('data-src')
             yield Request(i.attrs['href'],
                           callback=self.parse_3,
                           meta={
                               'images': images,
                               'category1': category1
                           })
     else:
         for i in page.find(
                 'div', class_='masonry-grid-wrapper masonry-with-spaces'
         ).find_all('div', class_='featured-area'):
             images = i.find('img').get('data-src')
             yield Request(i.find('a').get('href'),
                           callback=self.parse_3,
                           meta={
                               'images': images,
                               'category1': category1
                           })
     # Pagination: the two layouts place the "next page" link differently.
     next_page = None
     if page.find('span', class_='last-page first-last-pages') is not None:
         next_page = page.find(
             'span',
             class_='last-page first-last-pages').find('a').attrs['href']
     elif page.find('li', class_='the-next-page') is not None:
         next_page = page.find(
             'li', class_='the-next-page').find('a').attrs['href']
     if next_page is not None:
         pub_time = _last_pub_time(page)
         # BUG FIX: in the original, the `the-next-page` branch only followed
         # the next page when the date came from the posts-container layout
         # (the yield was nested one elif too deep), and pub_time could be
         # referenced while unbound. Both paths are unified here.
         if pub_time is not None and (
                 self.time is None
                 or Util.format_time3(pub_time) >= int(self.time)):
             yield Request(next_page, callback=self.parse_2)
示例#10
0
文件: dfa.py 项目: Doglen/crawler
 def parse_news_list(self, response):
     """Parse the DFA news list: attach each row's pub_time to meta, request
     the article page, then follow pagination until self.time is reached."""
     # Month name -> zero-padded month number (replaces a 12-branch elif chain).
     months = {
         "January": "01", "February": "02", "March": "03", "April": "04",
         "May": "05", "June": "06", "July": "07", "August": "08",
         "September": "09", "October": "10", "November": "11", "December": "12",
     }
     soup = BeautifulSoup(response.text, "html.parser")
     news_list = soup.select("tbody>tr") if soup.select("tbody>tr") else []
     time2 = None
     for news in news_list:
         url = "https://dfa.gov.ph" + news.select_one("a").get("href")
         # Date cell looks like "01 January 2021".
         date_cell = news.find("td", class_="list-date small")
         pub_time_list = date_cell.text.strip().split(" ") if date_cell else None
         if pub_time_list:
             month = months.get(pub_time_list[1])
             if month:
                 time2 = pub_time_list[2] + "-" + month + "-" + pub_time_list[
                     0] + " 00:00:00"
         response.meta["pub_time"] = time2
         yield scrapy.Request(url,
                              meta=response.meta,
                              callback=self.parse_news)
     next_page = "https://dfa.gov.ph" + soup.select_one(
         "li.pagination-next>a").get("href") if soup.select_one(
             "li.pagination-next>a") else None
     # Keep paginating while the last seen date is still within the cutoff.
     if self.time is None or (time2 and
                              Util.format_time3(time2) >= int(self.time)):
         if next_page:
             yield scrapy.Request(next_page,
                                  meta=response.meta,
                                  callback=self.parse_news_list)
     else:
         self.logger.info('时间截止')
示例#11
0
 def parse_news_list(self, response):
     """Parse a PhilBoxing news index: yield a Request (with the blurb in
     meta) per story credited to PhilBoxing.com, then follow the next-page
     link while dates are still within the self.time cutoff."""
     # Month abbreviation -> zero-padded month number. The site prints
     # "Sept" rather than "Sep" -- TODO confirm "Sep" never appears.
     months = {
         "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
         "Jun": "06", "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10",
         "Nov": "11", "Dec": "12",
     }
     soup = BeautifulSoup(response.text, "html.parser")
     # Lead story at the top of the page.
     top = soup.find("td", {"valign": "top"})
     web = top.select_one("td>font>a").text.strip() if top.select_one(
         "td>font>a").text else None
     if web and web == "PhilBoxing.com":
         url = top.select_one("td>a").get("href") if top.select_one(
             "td>a").get("href") else None
         abstract = top.select_one("td>font.newsblurb").text.strip().split(
             "\r\n\r\n") if top.select_one("td>font.newsblurb").text else None
         response.meta["abstract"] = ' '.join(abstract) if abstract else None
         if url:
             yield scrapy.Request(url,
                                  meta=response.meta,
                                  callback=self.parse_news)
     # Remaining stories live inside the third <p> of the main table.
     table = soup.find("table", {"width": "100%", "height": "100%"})
     p_tags = table.select("p") if table else []
     # BUG FIX: the original indexed [2] unconditionally and raised
     # IndexError when the table held fewer than three <p> elements.
     p = p_tags[2] if len(p_tags) > 2 else None
     web_list = p.select("p>font>a") if p and p.select("p>font>a") else None
     news_list = p.select("p>a") if p and p.select("p>a") else None
     abstract_list = p.select(
         "p>font.newsblurb") if p and p.select("p>font.newsblurb") else None
     i = 0
     if web_list:
         for web in web_list:
             if web.text.strip() == "PhilBoxing.com":
                 url = news_list[2 *
                                 i].get("href") if news_list and news_list[
                                     2 * i].get("href") else None
                 abstract = abstract_list[i].text.strip().split(
                     "\r\n\r\n"
                 ) if abstract_list and abstract_list[i].text else None
                 response.meta["abstract"] = ' '.join(
                     abstract) if abstract else None
                 if url:
                     yield scrapy.Request(url,
                                          meta=response.meta,
                                          callback=self.parse_news)
             # Index advances for every entry, matched or not.
             i += 1
     # Pagination: the last date on the page decides whether to continue.
     if p is not None:
         time_list = p.find_all("font", {"size": "2"})[-1].text.split(" ")
         if time_list:
             month = months.get(time_list[-2])
             time = (time_list[-1] + "-" + month + "-" + time_list[-3] +
                     " 00:00:00") if month else None
             if time and (self.time is None
                          or Util.format_time3(time) >= int(self.time)):
                 font_list = soup.select("font.boxertablebody") if soup.select(
                     "font.boxertablebody") else None
                 a_list = font_list[-1].select(
                     "a") if font_list and font_list[-1].select("a") else None
                 next_page = "http://philboxing.com/news/" + a_list[0].get(
                     "href") if a_list and a_list[0].get("href") else None
                 if next_page:
                     yield scrapy.Request(next_page,
                                          meta=response.meta,
                                          callback=self.parse_news_list)
             else:
                 self.logger.info('时间截止')
示例#12
0
    def parse_news(self, response):
        """Parse one article page into a DemoItem, skipping articles older
        than self.time (None means no cutoff)."""
        # Month abbreviation -> zero-padded month number. The source prints
        # "Sept" -- TODO confirm "Sep" never appears.
        months = {
            "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
            "Jun": "06", "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10",
            "Nov": "11", "Dec": "12",
        }
        item = DemoItem()
        item["category1"] = response.meta["category1"]
        item["category2"] = response.meta["category2"]
        # <br> tags become spaces so text extraction keeps word boundaries.
        response1 = response.text.replace('<br>', ' ')
        soup = BeautifulSoup(response1, "html.parser")
        # Publication time tokens come from the <p> inside div.title_text;
        # negative indices assume a fixed trailing token layout.
        temp = soup.select_one("div.title_text") if soup.select_one(
            "div.title_text") else None
        pub_time_list = re.split(
            " |,",
            temp.select_one("p").text) if temp.select_one("p").text else None
        if pub_time_list:
            month = months.get(pub_time_list[-5])
            if month is None:
                # BUG FIX: an unmatched month token used to leave `time`
                # unbound and raise NameError below.
                return
            time = pub_time_list[-4] + "-" + month + "-" + pub_time_list[
                -6] + " " + pub_time_list[-2] + ":00"
            item["pub_time"] = time

            if self.time is None or Util.format_time3(time) >= int(self.time):
                # Title
                item["title"] = temp.find("a").text.strip() if temp.find(
                    "a").text else None
                # Abstract and body
                body = []
                temp_list = soup.select_one("div.detail_text").find_all(
                    "p") if soup.select_one("div.detail_text").find_all(
                        "p") else None
                if temp_list:
                    for para in temp_list:
                        body.append(para.text.strip())
                    item["abstract"] = body[0]
                    item["body"] = "\n".join(body)
                else:
                    item["abstract"] = None
                    item["body"] = None
                # Images
                images = []
                image_list = soup.select("div.article_image") if soup.select(
                    "div.article_image") else None
                if image_list:
                    for image in image_list:
                        images.append(image.find("img").get("src"))
                item["images"] = images
                yield item
            else:
                self.logger.info('时间截止')
示例#13
0
 def parse_news(self, response):
     """Build a DemoItem from a DFA article page. Currently only logs the
     item -- the final yield is commented out; NOTE(review): confirm whether
     this spider is meant to emit items."""
     item = DemoItem()
     item["category1"] = response.meta["category1"]
     item["category2"] = response.meta["category2"]
     item["pub_time"] = response.meta["pub_time"] if response.meta["pub_time"] else Util.format_time()
     soup = BeautifulSoup(response.text, "html.parser")
     temp = soup.find("div", {"itemprop": "articleBody"}) if soup.find("div", {"itemprop": "articleBody"}) else None
     temp1 = temp.find("p", {"style": "text-align: center;"}) if temp and temp.find("p", {"style": "text-align: center;"}) else None
     item["title"] = temp1.text.strip() if temp1 and temp1.text else None
     body = []
     # BUG FIX: guard `temp` here too -- the original dereferenced it without
     # the None check applied on every other line, raising AttributeError
     # when the article body div is missing.
     temp2_list = temp.find_all("p", {"style": "text-align: justify;"}) if temp and temp.find_all("p", {"style": "text-align: justify;"}) else []
     for temp2 in temp2_list:
         # Drop inline scripts, then collapse non-breaking spaces.
         [s.extract() for s in temp2('script')]
         b = temp2.get_text().strip().split('\xa0') if temp2.text else None
         b = ' '.join(b) if b else None
         if b:
             body.append(b)
     item["abstract"] = body[0] if body else None
     item["body"] = '\n'.join(body) if body else None
     images = []
     temp3_list = temp.find_all("p", {"style": "text-align: center;"}) if temp and temp.find_all("p", {"style": "text-align: center;"}) else []
     for temp3 in temp3_list:
         image = "https://dfa.gov.ph" + temp3.find("img").get("src") if temp3.find("img") and temp3.find("img").get("src") else None
         if image:
             images.append(image)
     item["images"] = images
     self.logger.info(response.meta["pub_time"])
     self.logger.info(item)
     # yield item
示例#14
0
 def parse_news(self, response):
     """Parse one article page into a DemoItem: pub_time, images, body,
     abstract, categories and title."""
     # Month abbreviation -> zero-padded month number. The source prints
     # "Sept" -- TODO confirm "Sep" never appears.
     months = {
         "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "May": "05",
         "Jun": "06", "Jul": "07", "Aug": "08", "Sept": "09", "Oct": "10",
         "Nov": "11", "Dec": "12",
     }
     item = DemoItem()
     soup = BeautifulSoup(response.text, "html.parser")
     # Publication time from <time class="css-1sbuyqj">; fixed token
     # positions after splitting on commas/spaces.
     temp = soup.find("time", {"class": "css-1sbuyqj"}) if soup.find(
         "time", {"class": "css-1sbuyqj"}) else None
     temp_text = temp.text.strip() if temp and temp.text else None
     time_list = re.split(",| ", temp_text) if temp_text else None
     time2 = Util.format_time()  # default to "now" when the tag is missing
     if time_list:
         month = months.get(time_list[3])
         if month:
             time2 = time_list[5] + "-" + month + "-" + time_list[
                 1] + " " + time_list[6] + ":00"
     item["pub_time"] = time2
     # Images
     images = []
     img = soup.select_one("picture>img").get("src") if soup.select_one(
         "picture>img") else None
     if img:
         images.append(img)
     item["images"] = images
     # Body paragraphs
     body = []
     for p in soup.select("p.css-158dogj"):
         if p.text:
             body.append(p.text.strip())
     item['body'] = "\n".join(body) if body else None
     # Abstract: summary element, falling back to the first paragraph.
     abstract = soup.find("p", {
         "id": "article-summary"
     }).text.strip() if soup.find("p", {"id": "article-summary"}) else ''
     if abstract == '' or abstract == '.':
         abstract = body[0] if body else None
     item["abstract"] = abstract
     # Categories from the listing page's meta.
     item["category1"] = response.meta["category1"]
     item["category2"] = response.meta["category2"]
     # Title
     item["title"] = soup.find("h1", {
         "id": "link-1b44e840"
     }).text.strip() if soup.find("h1", {"id": "link-1b44e840"}) else None
     yield item
示例#15
0
 def parse_3(self, response):
     """Fill the item carried in meta with title, pub_time, body, abstract
     and image URLs scraped from the article detail page (best-effort)."""
     item = response.meta['item']
     new_soup = BeautifulSoup(response.text)
     try:
         item['title'] = new_soup.select('div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated div.col-sm-16.sec-info > h1')[0].text
         item['pub_time'] = time_font(new_soup.select('div.text-danger.sub-info-bordered div.time')[0].text) if len(new_soup.select('div.text-danger.sub-info-bordered div.time')) else Util.format_time()
         item['body'] = ''
         # Body paragraphs: regular layout first, carousel layout as fallback.
         paragraphs = new_soup.select('.col-sm-16.sec-info p')
         if not paragraphs:
             paragraphs = new_soup.select('.carousel-caption p')
         for para in paragraphs:
             item['body'] += para.text
         # BUG FIX: the carousel fallback used to store the <p> Tag object
         # itself as the abstract; take its text instead.
         item['abstract'] = paragraphs[0].text if paragraphs else None
         item['images'] = []
         image_tags = new_soup.select(
             'div.sec-topic.nt_detailview.col-sm-16.wow.fadeInDown.animated div.ntdv_imgcon > img')
         for new_images in image_tags:
             item['images'].append(new_images.get('src'))
     except Exception:
         # BUG FIX: was a bare `except: pass`, which also swallowed
         # KeyboardInterrupt/SystemExit. Still best-effort: a partially
         # filled item is yielded on any parse failure.
         pass
     yield item
示例#16
0
def headlinehindi_time_switch1(time_string):
    """Convert an ISO-8601 string such as ``2020-12-23T17:50:27+05:30``
    into a timestamp via ``Util.format_time3``.

    The UTC-offset suffix (``+05:30``) is dropped before parsing; the
    remainder must match ``%Y-%m-%dT%H:%M:%S``.
    """
    naive_part = time_string.rsplit("+", 1)[0]
    parsed = datetime.strptime(naive_part, "%Y-%m-%dT%H:%M:%S")
    return Util.format_time3(str(parsed))
示例#17
0
文件: doh.py 项目: Doglen/crawler
 def parse_news_list(self, response):
     """Walk a DOH news-list page.

     Emits one request per listed article (handled by ``parse_news``) and
     follows pagination while the newest article seen is still inside the
     crawl's time window (``self.time``).
     """
     home_url = 'https://doh.gov.ph/'
     time2 = ''
     soup = BeautifulSoup(response.text, "html.parser")
     news_list = soup.select("div.panel>div>div.view-content>div")
     # Month name -> zero-padded month number; replaces a 12-way elif chain.
     months = {
         "January": "01", "February": "02", "March": "03", "April": "04",
         "May": "05", "June": "06", "July": "07", "August": "08",
         "September": "09", "October": "10", "November": "11",
         "December": "12",
     }
     for news in news_list:
         # Publication date, e.g. "January 5, 2021" (time part is unknown,
         # so midnight is appended).
         date = news.find("span",
                          class_="field-content content-time").text.strip()
         dtime = " 00:00:00"
         pub_time_list = re.split(" |,", date) if date else None
         if pub_time_list and pub_time_list[0] in months:
             # Rebuild as "YYYY-MM-D 00:00:00"; the day is deliberately
             # not zero-padded, matching the original behaviour.
             time2 = (pub_time_list[-1] + "-" + months[pub_time_list[0]] +
                      "-" + pub_time_list[1] + dtime)
         response.meta['pub_time'] = time2
         # Article detail page
         url = urljoin(home_url, news.find("a").get("href"))
         yield scrapy.Request(url,
                              meta=response.meta,
                              callback=self.parse_news)
     # Pagination: follow the next page only while still inside the window.
     next_page = "https://doh.gov.ph/" + soup.select_one(
         "li.pager-next>a").get("href") if soup.select_one(
             "li.pager-next>a") else None
     if self.time is None or (time2 and
                              Util.format_time3(time2) >= int(self.time)):
         if next_page:
             yield scrapy.Request(next_page,
                                  meta=response.meta,
                                  callback=self.parse_news_list)
     else:
         self.logger.info('time out')
示例#18
0
文件: zeen.py 项目: ldqsss/crawler
 def parse_eassys(self, response):  # pagination and article-URL extraction for each second-level category
     """Dispatch every listed article to its dedicated item parser and
     follow the "next" link while articles are still inside the crawl's
     time window.

     The URL decides the branch: photo galleries -> parse_item_photo,
     videos -> parse_item_video, everything else -> parse_item.

     NOTE(review): the video branch stores ``.date`` text in
     ``meta['pub_time']`` but compares the cutoff against ``span.date`` —
     presumably the same node; confirm against the live markup.
     """
     soup = BeautifulSoup(response.text, 'html.parser')
     # flag flips to False once an article is older than the cutoff;
     # pagination then stops.
     flag = True
     if re.match(r'.*photo-gallery.*', response.url):  # photo galleries
         for t in soup.find_all(class_='col-sm-4 col-md-4 photo-photo-h'):
             try:
                 url = 'https://zeenews.india.com' + t.select_one('a').get(
                     'href')
             except:
                 # Entry without a link — skip it.
                 continue
             response.meta['title'] = t.select_one('h3').text
             response.meta['images'] = [t.select_one('img').get('src')]
             response.meta['pub_time'] = t.select_one(
                 '.photo-date').text.strip()
             if self.time is None or Util.format_time3(
                     Util.format_time2(
                         t.select_one('.photo-date').text.strip())) >= int(
                             self.time):
                 yield Request(url,
                               callback=self.parse_item_photo,
                               meta=response.meta)
             else:
                 flag = False
                 self.logger.info('时间截止')
     elif re.match(r'.*video.*', response.url):  # videos
         for i in soup.find_all(
                 attrs={'class': 'mini-video mini-video-h margin-bt30px'
                        }):  # articles initially listed in this category
             url = 'https://zeenews.india.com' + i.select_one('a').get(
                 'href')
             response.meta['images'] = [i.select_one('img').get('src')]
             response.meta['title'] = i.select_one('h3').text
             response.meta['pub_time'] = i.select_one('.date').text.strip()
             if self.time is None or Util.format_time3(
                     Util.format_time2(
                         i.select_one('span.date').text.strip())) >= int(
                             self.time):
                 yield Request(url,
                               callback=self.parse_item_video,
                               meta=response.meta)
             else:
                 flag = False
                 self.logger.info('时间截止')
     else:
         for t in soup.find_all(
                 class_='section-article margin-bt30px clearfix'
         ):  # articles initially listed in this category
             url = 'https://zeenews.india.com' + t.select_one('a').get(
                 'href')
             response.meta['title'] = t.select_one('h3.margin-bt10px').text
             tt = t.select_one('span.date').text.strip().split()
             try:
                 # Translate the month name via self.hindi_month, keeping
                 # the remaining date pieces in their original order.
                 pub_time = self.hindi_month[tt[0]] + ' ' + tt[
                     1] + ' ' + tt[2] + ' ' + tt[3] + ' ' + tt[5]
             except:
                 # Unexpected date shape — keep the raw string.
                 pub_time = t.select_one('span.date').text.strip()
             response.meta['pub_time'] = pub_time
             response.meta['images'] = [t.select_one('img').get('src')]
             if self.time is None or Util.format_time3(
                     Util.format_time2(pub_time)) >= int(self.time):
                 yield Request(url=url,
                               meta=response.meta,
                               callback=self.parse_item)
             else:
                 flag = False
                 self.logger.info('时间截止')
     if flag:
         try:
             nextPage = 'https://zeenews.india.com/' + soup.find(
                 class_='next last').select_one('a').get('href')
             yield Request(nextPage,
                           callback=self.parse_eassys,
                           meta=response.meta)
         except:
             # No "next" link on the last page — stop paginating.
             self.logger.info('Next page no more!')