Exemplo n.º 1
0
def hello(request):
    """Render 'hello.html' with link elements whose date falls in a range.

    The range comes from optional ``minDate``/``maxDate`` GET parameters
    (format ``%Y-%m-%d``) and defaults to the last 24 hours.  Both bounds
    are widened by one hour before querying so boundary rows are kept,
    then restored before being echoed back in the context.
    """
    context = {}
    maxDate = datetime.datetime.today()
    oneDay = datetime.timedelta(days=1)
    minDate = maxDate - oneDay
    oneHour = datetime.timedelta(hours=1)
    if 'minDate' in request.GET:
        minDate = datetime.datetime.strptime(request.GET['minDate'],
                                             '%Y-%m-%d')

    if 'maxDate' in request.GET:
        maxDate = datetime.datetime.strptime(request.GET['maxDate'],
                                             '%Y-%m-%d')

    # Widen the window one hour on each side for the query.
    minDate -= oneHour
    maxDate += oneHour

    # Read the rows that fall inside the widened range.
    db = connection.DB()
    result = db.select(
        'SELECT * FROM link WHERE dat >= ? AND dat <= ?',
        [minDate.strftime('%Y-%m-%d'),
         maxDate.strftime('%Y-%m-%d')])
    elements = []
    for item in result:
        elements.append(model.Element(title=item[1], url=item[2],
                                      date=item[3]))
    # NOTE(review): `cmp` is presumably a module-level comparator in this
    # file (the Py2 builtin does not exist in Py3) — confirm.
    elements.sort(key=functools.cmp_to_key(cmp))
    # Restore BOTH bounds before echoing them back.  The original code
    # restored only minDate, so context['max'] leaked the widened value
    # (off by one hour, which can cross a day boundary).
    minDate += oneHour
    maxDate -= oneHour
    context['min'] = minDate.strftime('%Y-%m-%d')
    context['max'] = maxDate.strftime('%Y-%m-%d')
    context['item_list'] = elements
    context['len'] = len(elements)
    return render(request, 'hello.html', context)
Exemplo n.º 2
0
    def parse(self, response):
        """Crawl a Smashing Magazine article listing page.

        Saves every article dated within the last three days, then
        follows the pagination link to the next listing page.  Stops
        without paginating once an article older than the cutoff is
        seen, or when there is not exactly one 'next page' link.
        """
        base_url = "https://www.smashingmagazine.com"

        body = response.css('main section')
        # Evaluate the 'next page' selector once (the original ran it
        # twice: once for len(), once for the href).
        next_links = body.css('nav ul').css('.pagination__next').css(
            'a ::attr(href)')
        if len(next_links) != 1:
            return
        nextPageUrl = base_url + next_links[0].extract()

        elements = body.css('.container .row .col article')
        # Articles older than this cutoff need no update.  Hoisted out
        # of the loop; the original also computed an unused 'yesterday'.
        cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
        for item in elements:
            # Dom that stores the title and the url.
            title_dom = item.css('h1 a')
            url = base_url + title_dom.css('::attr(href)').extract()[0]
            title = title_dom.css('::text').extract()[0]
            # Dom that stores the publication date.
            date_dom = item.css('.article--post__content ')
            date = datetime.datetime.strptime(
                date_dom.css('time ::attr(datetime)').extract()[0],
                '%Y-%m-%d')
            if date < cutoff:
                return
            element = model.Element(title=title, date=date, url=url)
            print([title, date, url])
            element.save()
        # Call for the next page.
        yield scrapy.Request(url=nextPageUrl, callback=self.parse)
Exemplo n.º 3
0
 def parse(self, response):
     """Scrape anchor links, deriving each article's date from its URL.

     Assumes hrefs contain .../<year>/<month>/... path segments so the
     year and month can be sliced out.  Stops scanning once a link older
     than three days is seen.
     """
     element = response.css('li').css('a')
     for item in element:
         href = item.css('::attr(href)').extract()
         text = item.css('::text').extract()
         # Parse the date out of the href.
         # NOTE(review): href is a *list*; str(href) stringifies the
         # whole list ("['...']") before splitting — fragile, as the
         # author's own note below acknowledges.
         dateList = str(href).split('/')
         # Maybe incorrect.
         year = dateList[-4]
         month = dateList[-3]
         # The day of month is not present in the URL; default to the 1st.
         day = 1
         date = datetime.date(int(year), int(month), day)
         timedelta = datetime.timedelta(days=3)
         if datetime.datetime.strptime(
                 str(date),
                 '%Y-%m-%d') < datetime.datetime.today() - timedelta:
             return
         element = model.Element(title=str(text[0][2:]),
                                 date=date,
                                 url=str(href[0]))
         exist = element.check_exist()
         # If this element doesn`t exist in the database, it means this is a new element.
         # if exist == False:
         #    element.set_date(datetime.date.today())
         element.save()
Exemplo n.º 4
0
    def parse(self, response):
        """Save recent posts from a listing page and follow pagination.

        Stops scanning as soon as a post older than three days appears.
        """
        base_url = "http://thedesigninspi"
        for row in response.css(".views-row"):
            # The creation date sits after the '|' in the byline text.
            raw_date = row.css(
                "div[class = 'views-field views-field-created'] span ::text"
            ).extract()[0].split('|')[1]
            date = self.parse_date(raw_date)
            max_age = datetime.timedelta(days=3)
            if datetime.datetime.strptime(
                    str(date),
                    '%Y-%m-%d') < datetime.datetime.today() - max_age:
                return
            print(colored(date, "red"))
            heading = row.css(
                "div[class = 'views-field views-field-title']")
            url = base_url + heading.css("a ::attr(href)").extract()[0]
            title = heading.css("h2 ::text").extract()[0]
            model.Element(title=title, url=url, date=date).save()

        # Follow the pagination link, if there is one.
        next_dom = response.css(".pager-next a ::attr(href)")
        if len(next_dom) <= 0:
            return
        next_url = base_url + response.css(
            ".pager-next a ::attr(href)").extract()[0]
        yield scrapy.Request(url=next_url, callback=self.parse)
Exemplo n.º 5
0
    def parse(self, response):
        """Save recent archive items, then follow the next-posts link."""
        for entry in response.css('.small-archive-item'):
            # Panel holding the title/url anchor.
            panel = entry.css('.archive-panel')
            title = panel.css('h4 a ::text').extract()[0]
            url = panel.css('h4 a ::attr(href)').extract()[0]

            # The date comes from the byline text with its prefix stripped.
            raw_data = entry.css('.byline ::text')[2].extract()[3:]
            date = self.formatTime(raw_data)
            max_age = datetime.timedelta(days=3)
            if datetime.datetime.strptime(
                    str(date),
                    '%Y-%m-%d') < datetime.datetime.today() - max_age:
                return

            record = model.Element(title=title, url=url, date=date)
            record.save()
            print(colored(date, 'red'))

        links = response.css(
            "div[class = 'col-xs-6 text-right next-posts-link'] a ::attr(href)"
        ).extract()
        if not links:
            return
        yield scrapy.Request(url=links[0], callback=self.parse)
Exemplo n.º 6
0
    def parse(self, response):
        """Save recent grid articles (date encoded in the URL), then paginate."""
        articles = response.css("article[class='grid-item']")
        print(colored(len(articles), "red"))

        for entry in articles:
            title = entry.css(
                "h2[class = 'sub-item__title h3'] ::text").extract()[0]
            url = entry.css(
                "a[class='grid-item__link'] ::attr(href)").extract()[0]
            # The URL path carries the date: .../YYYY/MM/DD/slug/
            parts = url.split('/')
            date = datetime.date(year=int(parts[-5]),
                                 month=int(parts[-4]),
                                 day=int(parts[-3]))
            print(colored(date, "red"))
            max_age = datetime.timedelta(days=3)
            if datetime.datetime.strptime(
                    str(date),
                    '%Y-%m-%d') < datetime.datetime.today() - max_age:
                return
            record = model.Element(title=title, url=url, date=date)
            record.save()
        more = response.css(
            "a[class='next page-numbers']::attr(href)").extract()
        if more:
            yield scrapy.Request(url=more[0], callback=self.parse)
Exemplo n.º 7
0
 def parse2(self, response):
     """Persist an article whose title and url arrived via request meta."""
     raw = response.css('.timestamp ::text').extract()[0]
     date = self.formatTime(raw)
     record = model.Element(title=response.meta['title'],
                            date=date,
                            url=response.meta['url'])
     record.save()
     print(colored(date, 'red'))
Exemplo n.º 8
0
 def parse(self, response):
     """Save every featured post listed on the index page."""
     for post in response.css('.index-posts .with-featured-image'):
         date = self.formatTime(
             post.css('.entry-byline-date ::text').extract()[0])
         anchor = post.css('.entry-title a')
         record = model.Element(title=anchor.css('::text').extract()[0],
                                date=date,
                                url=anchor.css('::attr(href)').extract()[0])
         record.save()
Exemplo n.º 9
0
 def parse2(self, response):
     """Save the article carried in meta if it is at most three days old."""
     stamp = response.css('time').css('::attr(datetime)').extract()[0]
     url = response.meta['url']
     title = response.meta['title']
     # Only the leading YYYY-MM-DD part of the datetime attribute matters.
     date = datetime.datetime.strptime(stamp[:10], '%Y-%m-%d')
     if date < datetime.datetime.today() - datetime.timedelta(days=3):
         return
     print(colored(date, "red"))
     model.Element(title=title, date=date, url=url).save()
Exemplo n.º 10
0
 def parse2(self, response):
     """Extract the date from the header paragraph and save the article."""
     paragraphs = response.css('header').css('div').css('p')
     texts = paragraphs[0].css('::text').extract()
     # The date lives in the last text node of the first paragraph.
     date = datetime.datetime.strptime(self.get_date(texts[-1]),
                                       '%Y-%m-%d')
     if date < datetime.datetime.today() - datetime.timedelta(days=3):
         return
     record = model.Element(title=response.meta['text'][0],
                            date=date,
                            url=response.meta['href'][0])
     record.save()
Exemplo n.º 11
0
 def parse_date(self, response):
     """Save the article once its date (from the 'datemeta' node) is known."""
     raw = response.css("li[class = 'datemeta'] ::text").extract()[0]
     date = self.get_date(date_info=raw)
     cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
     if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
         return
     title = response.meta['title']
     url = response.meta['url']
     print(colored(date, "red"))
     print(colored(title, "red"))
     model.Element(title=title, url=url, date=date).save()
Exemplo n.º 12
0
 def parse(self, response):
     """Save every article on the page, dating it from its <time> element."""
     for entry in response.css('article'):
         stamp = entry.css('time ::attr(datetime)').extract()[0]
         parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d')
         # Store a plain date, not a datetime.
         date = datetime.date(parsed.year, parsed.month, parsed.day)
         anchor = entry.css('header h2 a')
         title = anchor.css('::text').extract()[0]
         url = anchor.css('::attr(href)').extract()[0]
         model.Element(title=title, date=date, url=url).save()
         print(colored(date, 'red'))
Exemplo n.º 13
0
 def parse(self, response):
     """Save every item from a JSON listing response."""
     payload = json.loads(response.body)
     for key, info in payload["items"].items():
         link = info['link']
         # Strip the tracking query string.
         link = link[:link.find('?source')]
         title = info['title']
         date = self.formatTime(info['date'])
         # Timestamps on this site are unordered, but there are only a
         # few articles, so the whole site is re-crawled each time.
         model.Element(title=title, date=date, url=link).save()
Exemplo n.º 14
0
 def parse(self, response):
     """Save recent list items, then follow the next-page link."""
     for entry in response.css("ul[class='flex fixedSpaces']").css("li"):
         hrefs = entry.css("a[class='article-element']::attr(href)").extract()
         titles = entry.css("a[class='article-element']::attr(title)").extract()
         date = self.formatTime(entry.css("div::text").extract()[0])
         cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
         if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
             return
         model.Element(title=titles[0], date=date, url=hrefs[0]).save()
     more = response.css("a[class='next page-numbers']::attr(href)").extract()
     if more:
         yield scrapy.Request(url=more[0], callback=self.parse)
Exemplo n.º 15
0
 def parse(self, response):
     """Save every session listed on the schedule page.

     The date is rebuilt from '<weekday> <Month> <day>' text with a
     hard-coded year of 2018.
     """
     print(colored("begin to parse!", 'red'))
     articles = response.css("div[class='sb5-session-detail row']")
     print(colored("Get articles", "red"))
     month_names = ('january', 'february', 'march', 'april', 'may',
                    'june', 'july', 'august', 'september', 'october',
                    'november', 'december')
     for session in articles:
         url = session.css(
             "p[class='sb5-session-title'] a::attr(href)").extract_first()
         print(colored("get url", "red"))
         title = session.css(
             "p[class='sb5-session-title'] a::text").extract_first()
         print(colored("get title", "red"))
         timestr = session.css(
             "p[class='sb5-time'] time::text").extract_first()
         pieces = timestr.split()
         date = datetime.datetime(
             year=2018,
             month=month_names.index(pieces[1].lower()) + 1,
             day=int(pieces[2]))
         print(colored(date, 'red'))
         model.Element(title=title, date=date, url=url).save()
Exemplo n.º 16
0
 def parse(self, response):
     """Save the single post on this page, then queue the next URL."""
     href = response.url
     title = response.css(
         "h3[class='post-title entry-title']::text").extract()[0].strip()
     raw_time = response.css("h2[class='date-header']").css(
         "span::text").extract()[0]
     date = self.formatTime(raw_time)
     print('\033[1;35m {} {} {} \033[0m'.format(href, title, date))
     # Disabled while crawling the whole site; uncomment the two lines
     # below when doing daily updates.
     # if(date != datetime.date.today()):
     #     raise CloseSpider("{name}没有新的更新".format(name=self.name))
     model.Element(title=title, date=date, url=href).save()
     if self.index + 1 < len(self.urls):
         self.index += 1
         yield scrapy.Request(url=self.urls[self.index],
                              callback=self.parse)
Exemplo n.º 17
0
    def parse(self, response):
        """Save articles that are not yet in the database.

        The site exposes no publication date, so a fixed placeholder
        date is used for the existence check and genuinely new rows are
        stamped with today's date before saving.
        """
        for item in response.css('article'):
            # Placeholder date: the site itself publishes no date; this
            # should change in the future.
            date = datetime.date(year=2018, month=10, day=1)
            href = item.css('a ::attr(href)').extract()[0]
            title = item.css("a[class = 'h-link'] h2 ::text").extract()[0]
            # This may be useful in the future:
            # subtitle = item.css("a[class = 'h-link'] h3 ::text").extract()[0]

            element = model.Element(title=title, url=href, date=date)
            # Idiomatic truthiness test (was '== False').
            if not element.check_exist():
                element.date = datetime.date.today()
                element.save()
                print(colored(title, 'red'))
Exemplo n.º 18
0
    def parse2(self, response):
        """Parse the post date from its page and save the article.

        Title and URL arrive through ``response.meta`` from the listing
        callback; the date header is formatted as MM/DD/YYYY.
        """
        # Date header, split into [month, day, year].
        dateStr = response.xpath("//h2[@class='date-header']").xpath(
            "text()").extract_first().split('/')
        date = datetime.datetime(year=int(dateStr[2]),
                                 month=int(dateStr[0]),
                                 day=int(dateStr[1]))
        # Title and URL handed over by the listing callback.
        title = response.meta['title']
        url = response.meta['url']

        # Removed an unused 'timedelta' local and the disabled age check
        # that referenced it (dead commented-out code).
        print(colored(date, 'red'))

        # Persist the article.
        element = model.Element(title=title, date=date, url=url)
        element.save()
Exemplo n.º 19
0
 def parse(self, response):
     """Save recent articles from the main listing, then paginate."""
     for article in response.css("main[id='main']").css("article"):
         hrefs = article.css('h2').css('a::attr(href)').extract()
         titles = article.css('h2').css('a::attr(title)').extract()
         raw = article.css(
             "span[class='meta-date']::text").extract()[0].strip()
         date = self.formatTime(raw)
         cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
         if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
             return
         model.Element(title=titles[0], date=date, url=hrefs[0]).save()
     more = response.css(
         "a[class='next page-numbers']::attr(href)").extract()
     if more:
         yield scrapy.Request(url=more[0], callback=self.parse)
Exemplo n.º 20
0
 def parse(self, response):
     """Save recent posts and crawl listing pages up to page 32."""
     global page_num
     page_num += 1
     for entry in response.css('article'):
         # Only real posts carry an id attribute; skip the rest.
         if not entry.css('::attr(id)').extract():
             continue
         href = entry.css('h2 a').css('::attr(href)').extract()[0]
         title = entry.css('h2 a').css('::text').extract()[0]
         stamp = entry.css('.cb-date').css('time').css(
             '::attr(datetime)').extract()[0]
         print(colored(stamp, "red"))
         date = datetime.datetime.strptime(stamp, '%Y-%m-%d')
         if date < datetime.datetime.today() - datetime.timedelta(days=3):
             return
         model.Element(title=title, date=date, url=href).save()
     if page_num <= 32:
         yield scrapy.Request(url=target + str(page_num) + '/',
                              callback=self.parse)
Exemplo n.º 21
0
 def parse(self, response):
     """Save recent articles from the JSON API, then request the next batch."""
     infos = json.loads(response.body)
     for info in infos:
         href = info['url']
         title = info['title']
         date = self.formatTime(info['publish_date'])
         cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
         if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
             return
         model.Element(title=title, date=date, url=href).save()
     if response.status == 200:
         # Advance the offset embedded in the API URL by the page size (8).
         offset = int(response.url.split('/')[-2]) + 8
         nextURL = 'https://www.sitepoint.com/janus/api/LatestArticle/8/{s}/All'.format(
             s=offset)
         yield scrapy.Request(url=nextURL, callback=self.parse)
Exemplo n.º 22
0
    def parse(self, response):
        """Scrape uxbooth.com article listings and follow pagination.

        Parses each article's title, url and date (the date is embedded
        in free text such as 'Amy Grace Wells &bullet; October 2nd,
        2018'), saves articles no older than three days, then follows
        the pagination link that comes after the current-page marker.
        """
        base_url = "http://www.uxbooth.com"
        # Narrow down to the primary article container.
        container = response.css('.article-archive .page-body .wrapper')
        container = container.css(
            '.page-body__inner .page-body__layout .page-body__primary')
        article_container = container.css('.articles')

        articles = article_container.css('article')

        for item in articles:
            url = item.css('a ::attr(href)').extract()[0]
            title = item.css('a h1 ::text').extract()[0].replace('  ',
                                                                 '').replace(
                                                                     '\n', '')
            date_str = item.css(
                '.articles__article-meta ::text').extract()[0].replace(
                    '  ', '').replace('\n', '')
            # Example: 'Amy Grace Wells &bullet; October 2nd, 2018'
            day = 1
            month = 1
            year = 2018
            for item2 in DATE_DICT.keys():
                if date_str.rfind(item2) != -1:
                    date_str = date_str[date_str.rfind(item2):]
                    # Get the month.
                    month = DATE_DICT[item2]
                    # Get the day: text between the month name and the
                    # ordinal suffix before the comma.
                    day = int(date_str[len(item2):date_str.rfind(',') -
                                       2].replace(' ', ''))
                    # Get the year.
                    year = int(date_str[date_str.rfind(',') + 1:].replace(
                        ' ', ''))
                    break
            # Set the date.
            date = datetime.date(year=year, month=month, day=day)
            print(colored(date, 'red'))
            timedelta = datetime.timedelta(days=3)
            if date < datetime.date.today() - timedelta:
                return
            element = model.Element(title=title, url=url, date=date)
            element.save()

        # Pagination: find the link right after the current-page marker.
        container = container.css('.pagination')
        ulist = container.css('ul li')
        current_page = ulist.css('.pagination__link--current')

        list_obj = ulist.css('a').extract()
        current_page_obj = current_page.extract()

        current_page_index = -1
        for index, item in enumerate(list_obj):
            if item == current_page_obj[0]:
                current_page_index = index

        if current_page_index == -1:
            return

        next_page_index = current_page_index + 1
        # Fixed off-by-one: the original used '>', which let an index
        # equal to len(list_obj) through and raised IndexError below.
        if next_page_index >= len(list_obj):
            return
        # Get the url of the next page.
        next_page_url = base_url + ulist.css('a')[next_page_index].css(
            '::attr(href)').extract()[0]
        yield scrapy.Request(url=next_page_url, callback=self.parse)