def test_function(self):
    """pytime's today/tomorrow/yesterday helpers agree with plain
    datetime arithmetic, for both the implicit 'now' and date strings."""
    # today() mirrors date.today(); an explicit year is substituted in.
    self.assertTrue(pytime.today() == datetime.date.today())
    self.assertTrue(
        pytime.today(2014) == datetime.date.today().replace(year=2014))
    # tomorrow()/yesterday() shift by exactly one day.
    self.assertTrue(
        pytime.tomorrow() == datetime.date.today() + datetime.timedelta(days=1))
    self.assertTrue(pytime.tomorrow('2015-5-19') == datetime.date(2015, 5, 20))
    self.assertTrue(
        pytime.yesterday() == datetime.date.today() - datetime.timedelta(days=1))
    self.assertTrue(pytime.yesterday('2015-5-29') == datetime.date(2015, 5, 28))
def parse(self, response):
    """Parse every article URL on the list page and hand it to scrapy for
    download, paging forward through the live browser session until the
    newest article on a page is older than the cutoff window."""
    while response:
        post_nodes = response.css(
            "#dnn_ctr59828_ArticleList__ctl0_ArtDataList a::attr(href)"
        ).extract()
        news_time = response.css(
            "#dnn_ctr59828_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
        ).extract_first()
        # Stop crawling once this page's newest article falls outside the window.
        if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
            print(news_time + "\n")
            return
        for href in post_nodes:
            yield Request(url=parse.urljoin(response.url, href),
                          callback=self.parse_detail)
        # Click "next" in the browser and rebuild a scrapy response from it.
        self.browser.find_element_by_css_selector(
            "#dnn_ctr59828_ArticleList__ctl0_lbtnNext").click()
        selector = Selector(text=self.browser.page_source)
        page_num = selector.css(
            "#dnn_ctr59828_ArticleList__ctl0_plPageNum::text"
        ).extract_first()
        print("page is " + page_num)
        response = HtmlResponse(url=self.browser.current_url,
                                body=self.browser.page_source,
                                encoding="utf-8")
    return
def parse(self, response):
    """Parse all article URLs on the list page, schedule their detail
    pages, then follow the next-page link.

    Sets ``self.tag`` to the category name based on which listing URL
    is being crawled.
    """
    if response.url == "http://www.jwc.shu.edu.cn/index/tzgg.htm":
        self.tag = "通知公告"
    elif response.url == "http://www.jwc.shu.edu.cn/index/xw.htm":
        self.tag = "新闻"
    post_nodes = response.css(
        "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_titleLink1::attr(href)"
    ).extract()
    # BUG FIX: the id selector was missing its leading '#', so it could
    # never match an element id and news_time was always None.
    news_time = response.css(
        "#dnn_ctr43516_ArticleList__ctl0_ArtDataList__ctl1_Label6::text"
    ).extract_first()
    # Stop crawling once the newest article is older than the cutoff window.
    if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
        print(news_time)
        return
    for post_node in post_nodes:
        yield Request(url=parse.urljoin(response.url, post_node),
                      callback=self.parse_detail)
    # Follow the next-page link, if one exists, carrying the tag along.
    next_url = response.css(
        "a.Next:nth-child(3)::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url),
                      meta={"tag": self.tag}, callback=self.parse)
def main():
    """Pull gmail data (optionally only mail since the previous run),
    then parse the downloaded HTML files and export the events to CSV."""
    config = get_config()
    # --latest restricts the pull to messages received after the last run.
    pull_latest = any(flag == '--latest' for flag in sys.argv)
    query = ''
    if pull_latest:
        last_run = get_last_run_time(config['last_run'])
        query = 'after:{}'.format(last_run)
    # Fetch mail; the run timestamp is recorded only on success.
    try:
        pull_gmail_data(query)
        config['last_run'] = str(pytime.today())
        save_config(config)
    except NoMessagesFoundException as e:
        print(e)
    # Parse the saved HTML files and write the extracted events out.
    html = HtmlReader.HtmlReader()
    events = html.read_all('./htmlFilesv2/*.html')
    write_to_csv(events)
def test_function(self):
    """pytime day helpers agree with plain datetime arithmetic across the
    default 'now', explicit years, date strings, and unix timestamps."""
    self.assertTrue(pytime.today() == datetime.date.today())
    self.assertTrue(
        pytime.today(2014) == datetime.date.today().replace(year=2014))
    self.assertTrue(
        pytime.tomorrow() == datetime.date.today() + datetime.timedelta(days=1))
    self.assertTrue(pytime.tomorrow('2015-5-19') == datetime.date(2015, 5, 20))
    self.assertTrue(
        pytime.yesterday() == datetime.date.today() - datetime.timedelta(days=1))
    self.assertTrue(pytime.yesterday('2015-5-29') == datetime.date(2015, 5, 28))
    # Unix-timestamp inputs (offset to GMT+8) compare equal to naive datetimes.
    self.assertTrue(
        pytime.yesterday(1432310400 + gmt8offset) == datetime.datetime(2015, 5, 22))
    self.assertTrue(
        pytime.tomorrow(1432310400 + gmt8offset) == datetime.datetime(2015, 5, 24))
def parse(self, response):
    """Walk the list-page table rows, scheduling a detail request for each
    row whose creation date is inside the time window; stop at the first
    row outside it (rows are assumed date-ordered, newest first)."""
    post_nodes = response.css(".views-table > tbody:nth-child(1) tr")
    for post_node in post_nodes:
        create_date = post_node.css(
            ".views-field-created::text").extract_first().strip()
        # NOTE(review): "%y" expects a two-digit year — confirm the site
        # really renders dates like "18-05-01" rather than "2018-05-01".
        create_date = datetime.datetime.strptime(create_date, "%y-%m-%d")
        post_node_url = post_node.css("a::attr(href)").extract_first()
        if pytime.count(pytime.today(), create_date) < datetime.timedelta(TIME_DELTA_DAYS):
            # Dropped the unused `url` and `delta` locals the original
            # computed here; they were never read.
            yield Request(url=parse.urljoin(response.url, post_node_url),
                          meta={"create_date": create_date},
                          callback=self.parse_detail,
                          dont_filter=True)
        else:
            break
def parse(self, response):
    """Parse article links from the list page, tagging each detail request
    with the breadcrumb category, then follow the next-page link."""
    # The category name is the second anchor in the breadcrumb trail.
    tag = response.css(
        "#dnn_dnnBREADCRUMB_lblBreadCrumb > a:nth-child(2)::text"
    ).extract_first()
    post_nodes = response.css(
        "#dnn_ctr1053_ArticleList_ctl00_lstArticles > tbody:nth-child(1) a::attr(href)"
    ).extract()
    news_time = response.css(
        "#dnn_ctr1053_ArticleList_ctl00_lstArticles_ctl00_lblPublishDate::text"
    ).extract_first()
    # Stop once the newest article is older than the cutoff window.
    if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
        print(news_time)
        return
    for href in post_nodes:
        yield Request(url=parse.urljoin(response.url, href),
                      meta={"tag": tag}, callback=self.parse_detail)
    # Follow the next-page link, if one exists.
    next_url = response.css(
        "a.Next:nth-child(3)::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url),
                      callback=self.parse)
def parse(self, response):
    """Parse article URLs from the list page, schedule detail requests
    tagged with their category, then follow the next-page link."""
    post_nodes = response.css(
        "#dnn_ctr59825_ArticleList__ctl0_ArtDataList a::attr(href)").extract()
    news_time = response.css(
        "#dnn_ctr59825_ArticleList__ctl0_ArtDataList__ctl0_Label6::text"
    ).extract_first()
    # Stop once the newest article is older than the cutoff window.
    if pytime.count(pytime.today(), news_time) > datetime.timedelta(TIME_DELTA_DAYS):
        print(news_time + "\n")
        return
    # BUG FIX: extract the breadcrumb text. The original kept the raw
    # SelectorList, which is a list and therefore unhashable — it would
    # crash when placed in the set literal below (and sibling spiders
    # extract the text here too).
    tag = response.css(
        "#dnn_dnnBREADCRUMB_lblBreadCrumb > a::text").extract_first()
    if "tabid=31641" in response.url:
        tag = '学工新闻'
    for post_node in post_nodes:
        # BUG FIX: Request's meta must be a dict; {"tag", tag} built a set.
        yield Request(url=parse.urljoin(response.url, post_node),
                      meta={"tag": tag}, callback=self.parse_detail)
    # Follow the next-page link, if one exists.
    next_url = response.css(
        "#dnn_ctr59825_ArticleList__ctl0_lbtnNext::attr(href)").extract_first("")
    if next_url:
        yield Request(url=parse.urljoin(response.url, next_url),
                      callback=self.parse)
def process_item(self, item, spider):
    """Pipeline filter: pass through items whose create_date falls inside
    the cutoff window; older items are dropped (implicitly returns None)."""
    item_age = pytime.count(pytime.today(), item['create_date'])
    if item_age < datetime.timedelta(TIME_DELTA_DAYS):
        return item