def hello(request):
    """Render 'hello.html' with the link elements whose date falls in the
    requested range (defaults: yesterday .. today).

    Optional GET parameters:
        minDate, maxDate -- '%Y-%m-%d' strings overriding the defaults.
    """
    context = {}
    max_date = datetime.datetime.today()
    one_day = datetime.timedelta(days=1)
    min_date = max_date - one_day
    one_hour = datetime.timedelta(hours=1)
    if 'minDate' in request.GET:
        min_date = datetime.datetime.strptime(request.GET['minDate'], '%Y-%m-%d')
    if 'maxDate' in request.GET:
        max_date = datetime.datetime.strptime(request.GET['maxDate'], '%Y-%m-%d')
    # Widen the range by one hour on both sides so boundary rows survive
    # the date-string comparison in the query below.
    min_date -= one_hour
    max_date += one_hour
    # Read the matching rows from the database.
    db = connection.DB()
    result = db.select(
        'SELECT * FROM link WHERE dat >= ? AND dat <= ?',
        [min_date.strftime('%Y-%m-%d'), max_date.strftime('%Y-%m-%d')])
    elements = [model.Element(title=item[1], url=item[2], date=item[3])
                for item in result]
    elements.sort(key=functools.cmp_to_key(cmp))
    # Undo the widening on BOTH bounds before displaying them.
    # Bug fix: the original restored minDate only, so the displayed max
    # date could roll over to the next day (e.g. 23:30 + 1h).
    min_date += one_hour
    max_date -= one_hour
    context['min'] = min_date.strftime('%Y-%m-%d')
    context['max'] = max_date.strftime('%Y-%m-%d')
    context['item_list'] = elements
    context['len'] = len(elements)
    return render(request, 'hello.html', context)
def parse(self, response):
    """Scrape article title/url/date triples from a Smashing Magazine
    listing page and follow the pagination link.

    Bug fix: the original returned early when the 'next page' link was
    missing, so the articles on the LAST page were never saved.  The
    articles are now parsed first and pagination is handled afterwards.
    """
    base_url = "https://www.smashingmagazine.com"
    body = response.css('main section')
    # Locate the optional 'next page' link up front; it is only followed
    # after the current page's articles have been processed.
    next_links = body.css('nav ul').css('.pagination__next').css('a ::attr(href)')
    articles = body.css('.container .row .col article')
    for item in articles:
        # Dom that stores the title and the url.
        title_dom = item.css('h1 a')
        url = base_url + title_dom.css('::attr(href)').extract()[0]
        title = title_dom.css('::text').extract()[0]
        # Dom that stores the date.
        date_dom = item.css('.article--post__content ')
        date = datetime.datetime.strptime(
            date_dom.css('time ::attr(datetime)').extract()[0], '%Y-%m-%d')
        # Articles older than three days need no update; the listing is
        # assumed newest-first, so stop the whole crawl here.
        if date < datetime.datetime.today() - datetime.timedelta(days=3):
            return
        element = model.Element(title=title, date=date, url=url)
        print([title, date, url])
        element.save()
    # Follow the next page only when exactly one link is present.
    if len(next_links) == 1:
        yield scrapy.Request(url=base_url + next_links[0].extract(),
                             callback=self.parse)
def parse(self, response):
    """Collect every <li><a> link, derive a publication date from the URL
    path, and save each one as a model.Element.

    NOTE(review): `href` and `text` are *lists* returned by extract(),
    and the date is parsed from str(href) -- the list's repr -- which is
    fragile; the original author already flagged it "Maybe incorrect".
    """
    element = response.css('li').css('a')
    for item in element:
        href = item.css('::attr(href)').extract()  # list of hrefs (expected length 1)
        text = item.css('::text').extract()        # list of link texts
        #Parse the date.
        dateList = str(href).split('/')
        #Maybe incorrect: indexes assume a .../YYYY/MM/... layout inside
        # the stringified list -- TODO confirm against the actual URLs.
        year = dateList[-4]
        month = dateList[-3]
        # The day is not present in the URL, so it is pinned to the 1st.
        day = 1
        date = datetime.date(int(year), int(month), day)
        timedelta = datetime.timedelta(days=3)
        # Stop crawling once a link older than three days is seen
        # (presumably the links are ordered newest-first -- verify).
        if datetime.datetime.strptime(
                str(date), '%Y-%m-%d') < datetime.datetime.today() - timedelta:
            return
        # NOTE: rebinds (shadows) the outer `element` selector list.
        element = model.Element(title=str(text[0][2:]), date=date, url=str(href[0]))
        exist = element.check_exist()
        #If this element doesn't exist in the database, it means this is a new element.
        # if exist == False:
        #     element.set_date(datetime.date.today())
        element.save()
def parse(self, response):
    """Walk every '.views-row' entry, store title/url/date, then follow
    the pager's 'next' link when one exists."""
    base_url = "http://thedesigninspi"
    for row in response.css(".views-row"):
        raw_date = row.css(
            "div[class = 'views-field views-field-created'] span ::text"
        ).extract()[0].split('|')[1]
        date = self.parse_date(raw_date)
        # Entries older than three days mean the rest is stale: stop here.
        cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
        if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
            return
        print(colored(date, "red"))
        title_dom = row.css("div[class = 'views-field views-field-title']")
        url = base_url + title_dom.css("a ::attr(href)").extract()[0]
        title = title_dom.css("h2 ::text").extract()[0]
        model.Element(title=title, url=url, date=date).save()
    # Pagination: follow the next page only when the link is present.
    if len(response.css(".pager-next a ::attr(href)")) <= 0:
        return
    next_url = base_url + response.css(
        ".pager-next a ::attr(href)").extract()[0]
    yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Scrape each '.small-archive-item' (title, url, byline date); stop
    once an item older than three days appears, else follow the
    'next posts' link."""
    for entry in response.css('.small-archive-item'):
        # Panel holding the title and its link.
        panel = entry.css('.archive-panel')
        title = panel.css('h4 a ::text').extract()[0]
        url = panel.css('h4 a ::attr(href)').extract()[0]
        # The byline's third text node carries the date, prefixed by three
        # characters that are stripped before formatting.
        date = self.formatTime(entry.css('.byline ::text')[2].extract()[3:])
        if (datetime.datetime.strptime(str(date), '%Y-%m-%d')
                < datetime.datetime.today() - datetime.timedelta(days=3)):
            return
        model.Element(title=title, url=url, date=date).save()
        print(colored(date, 'red'))
    next_href = response.css(
        "div[class = 'col-xs-6 text-right next-posts-link'] a ::attr(href)"
    ).extract()
    if len(next_href) <= 0:
        return
    yield scrapy.Request(url=next_href[0], callback=self.parse)
def parse(self, response):
    """Parse grid articles; the publication date is encoded in the URL
    path (.../YYYY/MM/DD/...).  Stops at entries older than three days,
    then follows the 'next' pager link."""
    articles = response.css("article[class='grid-item']")
    print(colored(len(articles), "red"))
    for article in articles:
        title = article.css(
            "h2[class = 'sub-item__title h3'] ::text").extract()[0]
        url = article.css(
            "a[class='grid-item__link'] ::attr(href)").extract()[0]
        # The date can be parsed out of the URL path segments.
        parts = url.split('/')
        date = datetime.date(year=int(parts[-5]),
                             month=int(parts[-4]),
                             day=int(parts[-3]))
        print(colored(date, "red"))
        cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
        if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
            return
        model.Element(title=title, url=url, date=date).save()
    next_href = response.css(
        "a[class='next page-numbers']::attr(href)").extract()
    if next_href:
        yield scrapy.Request(url=next_href[0], callback=self.parse)
def parse2(self, response):
    """Second-stage callback: read the article's timestamp and persist it
    together with the title/url passed along in the request meta."""
    date = self.formatTime(response.css('.timestamp ::text').extract()[0])
    element = model.Element(title=response.meta['title'],
                            date=date,
                            url=response.meta['url'])
    element.save()
    print(colored(date, 'red'))
def parse(self, response):
    """Save title/date/url for every featured post on the index page."""
    for post in response.css('.index-posts .with-featured-image'):
        date = self.formatTime(
            post.css('.entry-byline-date ::text').extract()[0])
        link = post.css('.entry-title a')
        model.Element(title=link.css('::text').extract()[0],
                      date=date,
                      url=link.css('::attr(href)').extract()[0]).save()
def parse2(self, response):
    """Second-stage callback: read the <time datetime=...> stamp, skip
    articles older than three days, otherwise persist the title/url
    carried in the request meta."""
    stamp = response.css('time').css('::attr(datetime)').extract()[0]
    # Only the YYYY-MM-DD prefix of the stamp is used.
    date = datetime.datetime.strptime(stamp[:10], '%Y-%m-%d')
    if date < datetime.datetime.today() - datetime.timedelta(days=3):
        return
    print(colored(date, "red"))
    model.Element(title=response.meta['title'],
                  date=date,
                  url=response.meta['url']).save()
def parse2(self, response):
    """Second-stage callback: the date sits in the last text node of the
    first header <div><p>; drop articles older than three days, otherwise
    persist the title/url lists carried in the request meta."""
    texts = response.css('header').css('div').css('p')[0].css('::text').extract()
    date = datetime.datetime.strptime(self.get_date(texts[-1]), '%Y-%m-%d')
    if date < datetime.datetime.today() - datetime.timedelta(days=3):
        return
    model.Element(title=response.meta['text'][0],
                  date=date,
                  url=response.meta['href'][0]).save()
def parse_date(self, response):
    """Second-stage callback: parse the 'datemeta' list item into a date,
    skip items older than three days, then persist the element built from
    the title/url carried in the request meta."""
    raw = response.css("li[class = 'datemeta'] ::text").extract()[0]
    date = self.get_date(date_info=raw)
    cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
    if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
        return
    title = response.meta['title']
    url = response.meta['url']
    print(colored(date, "red"))
    print(colored(title, "red"))
    model.Element(title=title, url=url, date=date).save()
def parse(self, response):
    """Store every <article>'s title/url together with its
    <time datetime> stamp converted to a plain date."""
    for article in response.css('article'):
        stamp = article.css('time ::attr(datetime)').extract()[0]
        parsed = datetime.datetime.strptime(stamp, '%Y-%m-%d')
        # Down-convert the datetime to a date for storage.
        date = datetime.date(parsed.year, parsed.month, parsed.day)
        link = article.css('header h2 a')
        model.Element(title=link.css('::text').extract()[0],
                      date=date,
                      url=link.css('::attr(href)').extract()[0]).save()
        print(colored(date, 'red'))
def parse(self, response):
    """Parse the JSON API payload and save every item.

    The site returns items in no particular time order, but there are few
    articles, so the whole site is re-crawled every run (original
    author's note, translated from Chinese).

    Bug fix: the original used `href[:href.find('?source')]`
    unconditionally; when the '?source' marker is absent, str.find
    returns -1 and the slice silently drops the URL's last character.
    The URL is now truncated only when the marker is actually present.
    """
    payload = json.loads(response.body)
    for key, value in payload["items"].items():
        link = value['link']
        # Strip the '?source...' tracking suffix if present.
        marker = link.find('?source')
        if marker != -1:
            link = link[:marker]
        date = self.formatTime(value['date'])
        # print(link, value['title'], date)
        model.Element(title=value['title'], date=date, url=link).save()
def parse(self, response):
    """Scrape article links from the 'flex fixedSpaces' list; stop at
    entries older than three days; follow the 'next' pager link."""
    for li in response.css("ul[class='flex fixedSpaces']").css("li"):
        hrefs = li.css("a[class='article-element']::attr(href)").extract()
        titles = li.css("a[class='article-element']::attr(title)").extract()
        date = self.formatTime(li.css("div::text").extract()[0])
        cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
        if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
            return
        model.Element(title=titles[0], date=date, url=hrefs[0]).save()
    next_href = response.css(
        "a[class='next page-numbers']::attr(href)").extract()
    if next_href:
        yield scrapy.Request(url=next_href[0], callback=self.parse)
def parse(self, response):
    """Scrape session title/url and build a date from the textual
    'Weekday Month Day' time string (the year is hard-coded to 2018)."""
    print(colored("begin to parse!", 'red'))
    sessions = response.css("div[class='sb5-session-detail row']")
    print(colored("Get articles", "red"))
    # Month-name lookup table, hoisted out of the loop (index + 1 gives
    # the calendar month number).
    month_names = ('january', 'february', 'march', 'april', 'may', 'june',
                   'july', 'august', 'september', 'october', 'november',
                   'december')
    for session in sessions:
        url = session.css(
            "p[class='sb5-session-title'] a::attr(href)").extract_first()
        print(colored("get url", "red"))
        title = session.css(
            "p[class='sb5-session-title'] a::text").extract_first()
        print(colored("get title", "red"))
        tokens = session.css("p[class='sb5-time'] time::text").extract_first().split()
        # tokens[1] is the month name, tokens[2] the day of month.
        date = datetime.datetime(year=2018,
                                 month=month_names.index(tokens[1].lower()) + 1,
                                 day=int(tokens[2]))
        print(colored(date, 'red'))
        model.Element(title=title, date=date, url=url).save()
def parse(self, response):
    """Scrape the single post on this page, then queue the next URL from
    the precomputed self.urls list (driven by self.index)."""
    href = response.url
    title = response.css(
        "h3[class='post-title entry-title']::text").extract()[0].strip()
    raw_time = response.css("h2[class='date-header']").css(
        "span::text").extract()[0]
    date = self.formatTime(raw_time)
    print('\033[1;35m {} {} {} \033[0m'.format(href, title, date))
    # Keep disabled while crawling the whole site; uncomment the two
    # lines below for the daily-update run.
    # if(date != datetime.date.today()):
    #     raise CloseSpider("{name}没有新的更新".format(name=self.name))
    model.Element(title=title, date=date, url=href).save()
    if self.index + 1 < len(self.urls):
        self.index += 1
        yield scrapy.Request(url=self.urls[self.index], callback=self.parse)
def parse(self, response):
    """Scrape article links.  The site exposes no publication date, so a
    fixed placeholder is stored, and elements not yet in the database get
    today's date instead (code to revisit when the site adds dates)."""
    for article in response.css('article'):
        # Placeholder date -- no date is available on the website.
        date = datetime.date(year=2018, month=10, day=1)
        url = article.css('a ::attr(href)').extract()[0]
        title = article.css("a[class = 'h-link'] h2 ::text").extract()[0]
        # Possibly useful in the future:
        # subtitle = article.css("a[class = 'h-link'] h3 ::text").extract()[0]
        element = model.Element(title=title, url=url, date=date)
        if element.check_exist() == False:
            # New element: stamp it with today's date instead.
            element.date = datetime.date.today()
        element.save()
        print(colored(title, 'red'))
def parse2(self, response):
    """Second-stage callback: parse the month/day/year date header and
    save it with the title/url carried in the request meta."""
    # The date header is formatted month/day/year.
    parts = response.xpath("//h2[@class='date-header']").xpath(
        "text()").extract_first().split('/')
    date = datetime.datetime(year=int(parts[2]),
                             month=int(parts[0]),
                             day=int(parts[1]))
    # Title and url were handed over by the first-stage request.
    title = response.meta['title']
    url = response.meta['url']
    # Freshness check kept disabled, as in the original:
    # if datetime.datetime.strptime(str(date), '%Y-%m-%d') < datetime.datetime.today() - datetime.timedelta(days=3):
    #     return
    print(colored(date, 'red'))
    model.Element(title=title, date=date, url=url).save()
def parse(self, response):
    """Scrape articles inside <main id='main'>; stop once an entry older
    than three days appears; follow the 'next' pager link."""
    for article in response.css("main[id='main']").css("article"):
        hrefs = article.css('h2').css('a::attr(href)').extract()
        titles = article.css('h2').css('a::attr(title)').extract()
        raw_time = article.css(
            "span[class='meta-date']::text").extract()[0].strip()
        date = self.formatTime(raw_time)
        cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
        if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
            return
        model.Element(title=titles[0], date=date, url=hrefs[0]).save()
    next_href = response.css(
        "a[class='next page-numbers']::attr(href)").extract()
    if next_href:
        yield scrapy.Request(url=next_href[0], callback=self.parse)
def parse(self, response):
    """Scrape dated articles from the listing page; stop at articles older
    than three days; crawl pages sequentially up to page 32 using the
    module-level `page_num` counter and `target` base URL."""
    global page_num
    page_num += 1
    for article in response.css('article'):
        # Skip decorative <article> nodes that carry no id attribute.
        if not article.css('::attr(id)').extract():
            continue
        href = article.css('h2 a').css('::attr(href)').extract()[0]
        title = article.css('h2 a').css('::text').extract()[0]
        stamp = article.css('.cb-date').css('time').css(
            '::attr(datetime)').extract()[0]
        print(colored(stamp, "red"))
        date = datetime.datetime.strptime(stamp, '%Y-%m-%d')
        if date < datetime.datetime.today() - datetime.timedelta(days=3):
            return
        model.Element(title=title, date=date, url=href).save()
    if page_num <= 32:
        yield scrapy.Request(url=target + str(page_num) + '/',
                             callback=self.parse)
def parse(self, response):
    """Parse the LatestArticle JSON feed; stop at entries older than
    three days; on HTTP 200, request the next batch of 8 articles."""
    entries = json.loads(response.body)
    status = response.status
    for entry in entries:
        date = self.formatTime(entry['publish_date'])
        cutoff = datetime.datetime.today() - datetime.timedelta(days=3)
        if datetime.datetime.strptime(str(date), '%Y-%m-%d') < cutoff:
            return
        model.Element(title=entry['title'],
                      date=date,
                      url=entry['url']).save()
    if status == 200:
        # The URL's second-to-last path segment is the paging offset.
        offset = int(response.url.split('/')[-2]) + 8
        next_url = 'https://www.sitepoint.com/janus/api/LatestArticle/8/{s}/All'.format(
            s=offset)
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Scrape UX Booth archive articles (title, url, date parsed from a
    'Name • Month DayOrdinal, Year' byline) and follow the pagination.

    Bug fix: the next-page bound check used `>` instead of `>=`, so when
    the current page was the last entry in the pager the code indexed one
    past the end of the link list and raised IndexError.
    """
    base_url = "http://www.uxbooth.com"
    container = response.css('.article-archive .page-body .wrapper')
    container = container.css(
        '.page-body__inner .page-body__layout .page-body__primary')
    articles = container.css('.articles').css('article')
    for item in articles:
        url = item.css('a ::attr(href)').extract()[0]
        title = item.css('a h1 ::text').extract()[0].replace(' ', '').replace(
            '\n', '')
        date_str = item.css(
            '.articles__article-meta ::text').extract()[0].replace(
                ' ', '').replace('\n', '')
        # Example byline: 'Amy Grace Wells • October 2nd, 2018'
        day = 1
        month = 1
        year = 2018
        for month_name in DATE_DICT.keys():
            if date_str.rfind(month_name) != -1:
                date_str = date_str[date_str.rfind(month_name):]
                # Month comes straight from the lookup table.
                month = DATE_DICT[month_name]
                # Day: the digits between the month name and the ordinal
                # suffix ('st'/'nd'/'rd'/'th') preceding the comma.
                day = int(date_str[len(month_name):date_str.rfind(',') - 2]
                          .replace(' ', ''))
                # Year: everything after the comma.
                year = int(date_str[date_str.rfind(',') + 1:].replace(' ', ''))
                break
        date = datetime.date(year=year, month=month, day=day)
        print(colored(date, 'red'))
        if date < datetime.date.today() - datetime.timedelta(days=3):
            return
        model.Element(title=title, url=url, date=date).save()
    # --- pagination: locate the current page's link and take the next one.
    pagination = container.css('.pagination')
    ulist = pagination.css('ul li')
    current_page = ulist.css('.pagination__link--current')
    links = ulist.css('a').extract()
    current_markup = current_page.extract()
    current_index = -1
    for index, markup in enumerate(links):
        if markup == current_markup[0]:
            current_index = index
    if current_index == -1:
        return
    next_index = current_index + 1
    # Fixed: was `>`, which allowed next_index == len(links) and an
    # out-of-range index below.
    if next_index >= len(links):
        return
    next_page_url = base_url + ulist.css('a')[next_index].css(
        '::attr(href)').extract()[0]
    yield scrapy.Request(url=next_page_url, callback=self.parse)