def parse(self, response): message_list = response.xpath('//ul[@class="flfg_03"]/li') for message in message_list: date = "".join(message.xpath('span/text()').extract()) title = "".join(message.xpath('a/text()').extract()).replace("· ", "") href = "".join(message.xpath('a/@href').extract()) try: date = datetime.datetime.strptime(str(date).replace('-', '-'), '%Y-%m-%d') except Exception as e: date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if str("www.hainan.gov.cn") in href: result = session.query(NewsItemInfo).filter_by(url=href, web_id=23).count() if result: # print("{} 存在".format(href)) pass else: if str(href) == "http://www.hainan.gov.cn/": pass else: yield scrapy.Request(url=href, callback=self.get_detail, meta={"date": date, "title": title, "laiyuan": response.url}) else: url = response.url + href # .replace("./", "") result = session.query(NewsItemInfo).filter_by(url=url, web_id=23).count() if result: # print("{} 存在".format(url)) pass else: if str(url) == "http://www.hainan.gov.cn/": pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={"date": date, "title": title, "laiyuan": response.url})
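# Nearly every parse() in this file repeats the same try/except
# date-parsing idiom. A minimal consolidation sketch (the helper name
# `safe_parse_date` is an assumption, not part of the original codebase):
# it returns the parsed datetime, falling back to the current timestamp
# string on failure, exactly like the inline blocks do.
def safe_parse_date(raw, fmt='%Y-%m-%d'):
    try:
        return datetime.datetime.strptime(str(raw).replace('/', '-'), fmt)
    except Exception:
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))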
def parse(self, response): # print(response.text) message_list = response.xpath('//*[@id="con"]/tr') for message in message_list: title = "".join(message.xpath('td[2]/a/text()').extract()) href = "".join(message.xpath('td[2]/a/@href').extract()) date = "".join(message.xpath('td[3]/text()').extract()) date = date.replace(".", "-") # 防止空白行 只有提取到了标题的内容 才进行数据提取 if title != "": try: date = datetime.datetime.strptime( str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print(title, url, date) # 新闻发布会的数据内容不一样 if "http://www.mlr.gov.cn" in href: url = href.replace("index.htm", "") # print(url) result = session.query(NewsItemInfo).filter_by( url=url, web_id=8).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail_fbh, meta={ "date": date, "title": title, "laiyuan": response.url }) else: url = response.url + href result = session.query(NewsItemInfo).filter_by( url=url, web_id=8).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "date": date, "title": title, "laiyuan": response.url })
def parse(self, response): message_list = response.xpath('//ul[@class="nr_neirong"]/li') # print(len(message_list)) for message in message_list: title = "".join(message.xpath('span/a/text()').extract()) href = "".join(message.xpath('span/a/@href').extract()) date = "".join(message.xpath('font/text()').extract()) if "http" in href: url = href else: url = response.url + href # print(title, url, date) try: date = datetime.datetime.strptime( str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if ".pdf" not in str(url[:-5]).lower(): result = session.query(NewsItemInfo).filter_by( url=url, web_id=57).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "title": title, "date": date, "laiyuan": response.url }) else: result = session.query(NewsItemInfo).filter_by( url=url, web_id=57).count() if result: # print("{} 存在".format(url)) pass else: item = FagaiweiItem() item["url"] = url item["pub_time"] = date item["title"] = title item["content"] = "可能是图片或表格 打开原网站查看" item["webname"] = "中国期货业协会" item["web_id"] = 57 item["keyword"] = keyword.get_keyword(item["content"]) yield item
def parse(self, response): message_list = response.xpath('//ul[@class="list_02 clearfix"]/li') # print(len(message_list)) for message in message_list: date = "".join(message.xpath('font/text()').extract()) # print(date) title = "".join(message.xpath('a/text()').extract()) # print(title) href = "".join(message.xpath('a/@href').extract()) # print(href) try: date = datetime.datetime.strptime( str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if "www.xinhuanet.com" in href: result = session.query(NewsItemInfo).filter_by( url=href, web_id=2).count() if result: # print("{} 存在".format(href)) pass else: yield scrapy.Request(url=href, callback=self.get_detail, meta={ "date": date, "title": title }) elif href[-3:] == ".pdf": pass else: url = response.url + href[2:] result = session.query(NewsItemInfo).filter_by( url=url, web_id=2).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "date": date, "title": title, "laiyuan": response.url })
def parse(self, response): # print(response.text) message_list = response.xpath('//ul[@class="conList_ul"]/li|\ //ul[@class="govpushinfo150203"]/li') # print(len(message_list)) for message in message_list: title = "".join(message.xpath('a/text()').extract()) href = "".join(message.xpath('a/@href').extract()) date = "".join(message.xpath('span/text()').extract()) # print(title, href, date) try: date = datetime.datetime.strptime( str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if "http://fangtan.customs.gov.cn" in href: url = href else: url = "http://www.customs.gov.cn" + href result = session.query(NewsItemInfo).filter_by(url=url, web_id=11).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "title": title, "date": date, "laiyuan": response.url })
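# The duplicate check against NewsItemInfo is repeated verbatim in every
# spider. A minimal sketch of a shared predicate (the name `is_seen` is an
# assumption; `session` and `NewsItemInfo` are the same SQLAlchemy objects
# the spiders already use):
def is_seen(url, web_id):
    # A non-zero count means this URL was already stored for this site.
    return session.query(NewsItemInfo).filter_by(url=url,
                                                 web_id=web_id).count() > 0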
def parse(self, response):
    # Collect the detail-page URLs from the listing page.
    url = response.url
    orurl = 'http://www.xm.gov.cn/'
    contens_urls = response.xpath(
        "//div[@class='gl_list1']//li/a/@href").getall()
    name = response.xpath("//li[@class='on']/a/text()")[-1].get()
    for contens_url in contens_urls:
        if contens_url.startswith('./'):
            contens_url = contens_url.replace('./', '')
            contens_url = url + contens_url
        elif contens_url.startswith('../'):
            contens_url = contens_url.replace('../', '')
            contens_url = orurl + contens_url
        result = session.query(NewsItemInfo).filter_by(url=contens_url,
                                                       web_id=25).count()
        if result:
            # print("{} already exists".format(contens_url))
            pass
        else:
            yield scrapy.Request(url=contens_url,
                                 callback=self.parse_page,
                                 meta={
                                     "url": url,
                                     "name": name
                                 })
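# The hand-rolled './' and '../' handling above, and the bare
# `response.url + href` concatenations in the other spiders, can miscompute
# links when the listing URL is not directory-style. Scrapy's built-in
# `response.urljoin` applies standard URL resolution; a minimal sketch
# (adopting it would change the URLs stored for dedup, so this is only a
# suggestion, not the original behavior):
def resolve_href(response, href):
    # Handles './', '../', root-relative and already-absolute hrefs alike.
    return response.urljoin(href)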
def process_file(self, response):
    # Handle detail pages that are files, e.g. company announcements.
    # print('this is file !')
    for tmp in response.xpath('//*[@class="content"]/div[@ref]/div[2]'):
        # Build a fresh item per row so each yield is independent.
        item = FagaiweiItem()
        item['web_id'] = 51
        item['url'] = tmp.xpath('./div[1]/a/@href').extract_first()
        item['title'] = tmp.xpath('./div[1]/a/text()').extract_first(
            default='')
        item['web'] = response.meta.get('web')
        item['keyword'] = ''
        item['webname'] = '阿斯达克新闻'
        item_time = tmp.xpath(
            './div[@class="newstime4"]/text()').extract_first(default='')
        item['pub_time'] = ' '.join(item_time.split()[1:]).replace('/', '-')
        item['content'] = '这是一个文件,查看原文链接进行打开!!'
        result = session.query(NewsItemInfo).filter_by(url=item['url'],
                                                       web_id=51).count()
        if result:
            # print("{} already exists".format(item['url']))
            pass
        else:
            yield item
def parse(self, response): item = FagaiweiItem() urls = response.xpath("//span[@class='tit']/a/@href").getall() # print(urls) titles = response.xpath("//span[@class='tit']/a/text()").getall() times = response.xpath("//span[@class='time']/text()").getall() dabao = zip(urls, titles, times) for url, title, time1 in dabao: filename = re.findall(r'=(\d+)', url)[0] url2 = 'http://php.cnstock.com/news_new/index.php/api/fileview?ID=' + filename + '&db=txt' if url2[-4:] == '=txt': # print("==================================\n{}".format(durl)) result = session.query(NewsItemInfo).filter_by( url=url2, web_id=67).count() if result: # print("TXT 文件地址: {} 存在".format(url2)) pass else: content = txt.main(url=url2) item['content'] = content item['web_id'] = 67 item['title'] = title time = time1.replace('(', '').replace(')', '') item['pub_time'] = time item['webname'] = '中国证券网信息披露平台' item['web'] = response.url item['url'] = url2 item["keyword"] = keyword.get_keyword(item["content"]) yield item
def parse(self, response):
    doc = pq(response.text)
    keyword_list = doc('#topwords li a').text().split()
    url_list = doc('.news-list li')
    for content in url_list.items():
        webname = content('.account').text()
        web = response.url
        url = content('h3 a').attr('href')
        result = session.query(NewsItemInfo).filter_by(url=url,
                                                       web_id=13).count()
        if result:
            # print("{} already exists".format(url))
            pass
        else:
            title = content('h3').text()
            timestamp = content('.s-p').attr('t')
            pub_time = datetime.datetime.fromtimestamp(
                int(timestamp)).strftime('%Y-%m-%d %H:%M:%S')
            yield scrapy.Request(url,
                                 callback=self.process_detail,
                                 meta={
                                     'webname': webname,
                                     'web': web,
                                     'title': title,
                                     'pub_time': pub_time
                                 })
def parse(self, response): message_list = response.xpath( '//div[@class="wscn-tabs__content"]/div/div') # print(len(message_list)) for message in message_list: # date = "".join(message.xpath('span/a/text()|span/text()').extract()) # date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) title = "".join( message.xpath('div/div/a[1]/text()').extract()).replace( " ", "").replace("\n", "") href = "".join(message.xpath('div/div/a[1]/@href').extract()) # print(title, href) if "http" in href: url = href else: url = "https://wallstreetcn.com" + href result = session.query(NewsItemInfo).filter_by(url=url, web_id=46).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "title": title, "laiyuan": response.url })
def url_fagaiwei(response):
    message_list = response.xpath('//ul[@class="list_02 clearfix"]/li')
    for message in message_list:
        date = "".join(message.xpath('font/text()').extract())
        title = "".join(message.xpath('a/text()').extract())
        href = "".join(message.xpath('a/@href').extract())
        if title != "":
            try:
                date = datetime.datetime.strptime(
                    str(date).replace('/', '-'), '%Y-%m-%d')
                # print(date)
            except Exception as e:
                # print(e)
                date = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
            if "www.xinhuanet.com" in href:
                url = href
            elif href.endswith("pdf"):
                # Skip PDF links entirely; falling through here would reuse
                # a stale `url` from the previous iteration.
                continue
            elif "http" in href:
                url = href
            else:
                url = response.url + href
            result = session.query(NewsItemInfo).filter_by(url=url,
                                                           web_id=2).count()
            if result:
                # print("{} already exists".format(url))
                pass
            else:
                # Build a fresh dict per row so each yield is independent.
                data_list = {}
                data_list['date'] = date
                data_list['url'] = url
                data_list['title'] = title
                yield data_list
def parse(self, response): message_list = response.xpath( '//table[@class="sv_yh_14_30"]/tr/td/table/tr') # print(len(message_list)) for message in message_list: title = "".join(message.xpath('td[2]/a/text()').extract()) href = "".join(message.xpath('td[2]/a/@href').extract()) date = "".join(message.xpath('td[3]/text()').extract()) # print(title, href, date) date = date.replace('[', '').replace(']', '') try: date = datetime.datetime.strptime( str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) if href != "": url = response.url.replace("index.html", "") + href result = session.query(NewsItemInfo).filter_by( url=url, web_id=9).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "title": title, "date": date, "laiyuan": response.url })
def parse(self, response):
    pub_title = '大公报'
    data_title = ''.join(
        response.xpath("//div[@class='pannel_inner01']/div//text()").getall()
    ).replace('\n', '')
    web2 = 'http://news.takungpao.com.hk/paper/{}.html'.format(
        time.strftime("%Y%m%d", time.localtime()))
    url2s = response.xpath("//a[@class='bluelink']/text()").getall()
    for url2 in url2s:
        item = FagaiweiItem()
        param = re.search(r'第(\w+)版', url2).group(1)
        url = web2 + '?' + param
        result = session.query(NewsItemInfo).filter_by(url=url,
                                                       web_id=41).count()
        if result:
            # print("PDF file URL {} already exists".format(url))
            pass
        else:
            item['url'] = url
            item['title'] = pub_title + data_title + param
            item['content'] = '该页面为电子版报纸请点原链接查看'
            item['web'] = response.url
            item['webname'] = pub_title
            item['pub_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
            item["keyword"] = keyword.get_keyword(item["content"])
            item['web_id'] = 41
            yield item
def parse(self, response):
    pub_title = '中国上市公司'
    title2 = response.xpath("//span[@class='title']/text()").get()
    webname = pub_title + title2
    urls = response.xpath(
        "//div[@class='list_data']//li/a/@onclick").getall()
    urla = response.url
    data = response.xpath("//span[@class='date']/text()").get()
    if not data.startswith('2018'):
        data = '2018-' + data
    for url in urls:
        url = ('http://www.cnlist.com'
               + url.replace("OpenDetail('", '').replace(',', '?id=')
                    .replace("');", '').replace(' ', '').replace("'", ""))
        result = session.query(NewsItemInfo).filter_by(url=url,
                                                       web_id=64).count()
        if result:
            # print("PDF file URL {} already exists".format(url))
            pass
        else:
            yield scrapy.Request(url=url,
                                 callback=self.parse_page,
                                 meta={
                                     'url': urla,
                                     'webname': webname,
                                     'data': data
                                 })
def parse_tz(self, response):
    message_list = response.xpath('//tbody/tr')
    # print(len(message_list))
    for message in message_list:
        title = "".join(message.xpath('td/a/text()').extract())
        href = "".join(message.xpath('td/a/@href').extract())
        date = "".join(message.xpath('td[2]/text()').extract())
        try:
            date = datetime.datetime.strptime(
                str(date).replace('/', '-'), '%Y-%m-%d')
            # print(date)
        except Exception as e:
            # print(e)
            date = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        # print(title, href, date)
        url = href
        if ".pdf" not in str(url[:-5]).lower():
            result = session.query(NewsItemInfo).filter_by(
                url=url, web_id=57).count()
            if result:
                # print("{} already exists".format(url))
                pass
            else:
                yield scrapy.Request(url=url,
                                     callback=self.get_detail,
                                     meta={
                                         "title": title,
                                         "date": date,
                                         "laiyuan": response.url
                                     })
def parse(self, response): message_list = response.xpath('//div[@class="lie_main_m"]/ul/li|//div[@class="lie_main_m"]/li') for message in message_list: title = "".join(message.xpath('a/text()').extract()) # keyword = "".join(message.xpath('span/text()').extract()) href = "".join(message.xpath('a/@href').extract()) date = "".join(message.xpath('a/span/text()').extract()) # print(href) try: date = datetime.datetime.strptime(str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) url = response.url + href # print(title, date, url) result = session.query(NewsItemInfo).filter_by(url=url, web_id=20).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={"date": date, "title": title.replace("\r", "").replace("\n", "").replace("\t", ""), "laiyuan": response.url})
def parse(self, response): message_list = response.xpath('//div[@class="inner"]/div[2]/ul/li') # print(len(message_list)) for message in message_list: title = "".join(message.xpath('a/text()').extract()) href = "".join(message.xpath('a/@href').extract()) date = "".join(message.xpath('span/text()').extract()) try: date = datetime.datetime.strptime( str(date).replace('/', '-'), '%Y-%m-%d') # print(date) except Exception as e: # print(e) date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) url = "http://www.forestry.gov.cn" + href # print(title, url, date) result = session.query(NewsItemInfo).filter_by(url=url, web_id=15).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.get_detail, meta={ "title": title, "date": date, "laiyuan": response.url })
def parse(self, response):
    json_str = json.loads(response.text)
    urls = jsonpath.jsonpath(json_str, '$..url')
    pub_times = jsonpath.jsonpath(json_str, '$..dateTime')
    titles = jsonpath.jsonpath(json_str, '$..title')
    contents = jsonpath.jsonpath(json_str, '$..description')
    dabao = zip(urls, titles, pub_times, contents)
    for url, title, pub_time, content in dabao:
        result = session.query(NewsItemInfo).filter_by(url=url,
                                                       web_id=44).count()
        if result:
            # print("URL {} already exists".format(url))
            pass
        else:
            item = FagaiweiItem()
            item['webname'] = '央视网'
            item['web'] = 'http://news.cctv.com/'
            item['url'] = url
            item['pub_time'] = pub_time
            item['content'] = content
            item['keyword'] = ''
            item['title'] = title
            item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                             time.localtime())
            item['web_id'] = 44
            yield item
def parse(self, response):
    message_list = response.xpath('//ul/li')
    # print(len(message_list))
    for message in message_list:
        href = "".join(message.xpath('a/@href').extract())
        title = "".join(message.xpath('a/text()').extract())
        date = "".join(message.xpath('span/text()').extract())
        if date != "":
            if "http" in href.lower():
                url = href
            else:
                url = "http://www.cea.gov.cn" + href
            date = date.replace('[', '').replace(']', '')
            # print(date)
            try:
                date = datetime.datetime.strptime(
                    str(date).replace('/', '-'), '%Y-%m-%d %H:%M:%S')
                # print(date)
            except Exception as e:
                # print(e)
                date = time.strftime('%Y-%m-%d %H:%M:%S',
                                     time.localtime(time.time()))
            result = session.query(NewsItemInfo).filter_by(
                url=url, web_id=7).count()
            if result:
                # print("{} already exists".format(url))
                pass
            else:
                yield scrapy.Request(url=url,
                                     callback=self.get_detail,
                                     meta={
                                         "date": date,
                                         "title": title,
                                         "laiyuan": response.url
                                     })
def parse_2(self, response):
    message_list = response.xpath(
        '//div[@class="show_body clearfix"]/div[1]/ul/li')
    for message in message_list:
        title = "".join(message.xpath('a/text()').extract())
        href = "".join(message.xpath('a/@href').extract())
        date = "".join(message.xpath('span/text()').extract())
        date = date.replace("年", "-").replace("月", "-").replace("日", "")
        # print(date)
        try:
            date = datetime.datetime.strptime(
                str(date).replace('/', '-'), '%Y-%m-%d %H:%M')
            # print(date)
        except Exception as e:
            # print(e)
            date = time.strftime('%Y-%m-%d %H:%M:%S',
                                 time.localtime(time.time()))
        # print(title, href, date)
        result = session.query(NewsItemInfo).filter_by(url=href,
                                                       web_id=49).count()
        if result:
            # print("{} already exists".format(href))
            pass
        else:
            yield scrapy.Request(url=href,
                                 callback=self.get_detail_2,
                                 meta={
                                     "title": title,
                                     "date": date,
                                     "laiyuan": response.url
                                 })
def parse(self, response):
    urlsy = []
    json_str = json.loads(response.text)
    urls = jsonpath.jsonpath(json_str, '$..url')
    nodeIds = jsonpath.jsonpath(json_str, '$..nodeId')
    titles = jsonpath.jsonpath(json_str, '$..title')
    datas = jsonpath.jsonpath(json_str, '$..date')
    dabaos = zip(urls, nodeIds, titles, datas)
    for url, node, titles, datas in dabaos:
        # print(url)
        node = int(node)
        # Any nodeId added to this list filters out that whole news category.
        not_node = [1016, 368583, 174585]
        if url not in urlsy and node not in not_node:
            urlsy.append(url)
            result = session.query(NewsItemInfo).filter_by(
                url=url, web_id=33).count()
            if result:
                # print("{} already exists".format(url))
                pass
            else:
                yield scrapy.Request(url=url,
                                     callback=self.parse_page,
                                     meta={
                                         'title': titles,
                                         'data': datas
                                     })
def parse_js(self, response):
    url_list = re.findall(r'"url"\s?:\s?"(.*?)"', response.text)
    for url in url_list:
        result = session.query(NewsItemInfo).filter_by(url=url,
                                                       web_id=36).count()
        if result:
            # print("{} already exists".format(url))
            pass
        else:
            yield scrapy.Request(url,
                                 callback=self.process_detail,
                                 meta={'web': response.url})
def parse(self, response): url_list = response.xpath('//ul[@data-client="scroll"]/li/a/@href | //ul[@class="list_009"]/li/a/@href ').extract() for url in url_list: result = session.query(NewsItemInfo).filter_by(url=url, web_id=42).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url,callback=self.process_detail,meta={'web':response.url})
def parse(self, response): url_list = response.xpath('//div[@class="newslist"]/ul/li/div[1]/a/@href').extract() for url in url_list: new_url = 'http://www.acbgg.com' + url result = session.query(NewsItemInfo).filter_by(url=new_url, web_id=83).count() if result: # print("{} 存在".format(new_url)) pass else: yield scrapy.Request(new_url, callback=self.process_detail, meta={'web': response.url})
def parse(self, response): url_list = response.xpath('//*[@id="ent0"]/li//div[@class="news_title"]/em/a/@href').extract() # print(url_list) for url in url_list: result = session.query(NewsItemInfo).filter_by(url='https:' + url, web_id=36).count() if result: # print("{} 存在".format('https:' + url)) pass else: yield scrapy.Request('https:' + url, callback=self.process_detail, meta={'web': response.url})
def parse(self, response): url_list = response.xpath('//*[@id="mainlist"]/ul/li/p/a/@href|//ul[@id="idData"]/li/p[1]/a/@href').extract() for url in url_list: if '1' in url: result = session.query(NewsItemInfo).filter_by(url=url, web_id=61).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url, callback=self.process_detail, meta={'web': response.url})
def parse(self, response):
    # Collect the article links.
    url_list = response.xpath(
        '//div[@class="ct_b_l_list"]'
        '//a[@class="ct_b_l_l_tb_tltie"]/@href').extract()
    for url in url_list:
        result = session.query(NewsItemInfo).filter_by(url=url,
                                                       web_id=60).count()
        if result:
            # print("{} already exists".format(url))
            pass
        else:
            yield scrapy.Request(url,
                                 callback=self.process_detail,
                                 meta={"web": response.url})
def parse_juchao(response, item):
    PUB_URL = ('http://www.cninfo.com.cn/cninfo-new/disclosure/szse/'
               'bulletin_detail/true/')
    D_URL = 'http://www.cninfo.com.cn/cninfo-new/disclosure/szse/download/'
    json_str = json.loads(response.text)
    urls = jsonpath.jsonpath(json_str, "$..announcementId")
    title1 = jsonpath.jsonpath(json_str, "$..secCode")
    title2 = jsonpath.jsonpath(json_str, "$..secName")
    title3 = jsonpath.jsonpath(json_str, "$..announcementTitle")
    timestamp = jsonpath.jsonpath(json_str, "$..announcementTime")
    pdf = jsonpath.jsonpath(json_str, "$..adjunctUrl")
    if title2 is None:
        title2 = ''
        titles = zip(title1, title3)
    else:
        titles = zip(title1, title2, title3)
    url_contents = zip(urls, titles, timestamp, pdf)
    for url, title, time_local, pdf in url_contents:
        # item = {}
        if None in title:
            title = title[0] + title[2]
        # Strip characters that are illegal in file names.
        title = ' '.join(list(title)).replace('*', '').replace('/', '') \
            .replace('<', '').replace('>', '').replace('|', '') \
            .replace(':', '').replace('"', '').replace('?', '') \
            .replace('?', '')
        durl = D_URL + url  # PDF download URL
        if pdf[-4:] == '.PDF':
            # print("==================================\n{}".format(durl))
            result = session.query(NewsItemInfo).filter_by(
                url=PUB_URL + url, web_id=56).count()
            if result:
                # print("PDF file URL {} already exists".format(PUB_URL + url))
                pass
            else:
                contents = pdf_to_txt.main(url=durl, fileName=title)
                if len(contents) == 0:
                    item['content'] = '请点击原文链接查看'
                else:
                    item['content'] = '\n'.join(list(contents))
                # announcementTime is a millisecond epoch; convert to seconds.
                item['pub_time'] = datetime.datetime.fromtimestamp(
                    float(time_local) / 1000).strftime('%Y-%m-%d %H:%M:%S')
                item['webname'] = '巨潮资讯'
                item['web'] = response.url[0:-7]
                item['add_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())
                item["keyword"] = keyword.get_keyword(item["content"])
                item['web_id'] = 56
                item['title'] = title
                item['url'] = PUB_URL + url
                yield item
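# The chain of .replace() calls in parse_juchao strips characters that
# Windows forbids in file names. A regex-based sketch of the same idea
# (the helper name `sanitize_filename` is an assumption; the character set
# mirrors the replaces above):
def sanitize_filename(name):
    # Remove *, /, <, >, |, :, " and ? so the title is safe as a file name.
    return re.sub(r'[*/<>|:"?]', '', name)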
def parse(self, response):
    url_list = re.findall(r'(/article.*?html)', response.text)
    for url in url_list:
        url = url.replace('\\', '')
        full_url = 'http://kan.china.com' + url
        result = session.query(NewsItemInfo).filter_by(
            url=full_url, web_id=34).count()
        if result:
            # print("{} already exists".format(full_url))
            pass
        else:
            yield scrapy.Request(full_url, callback=self.process_detail)
def parse(self, response): urls = response.xpath("//ul[contains(@class,'article-mini')]//li/a/@href|" "//ul[@class='nf-list']//a/@href").getall() urla = response.url for url in urls: # print("{}+++++++++++++++{}".format(urla, url)) result = session.query(NewsItemInfo).filter_by(url=url, web_id=66).count() if result: # print("{} 存在".format(url)) pass else: yield scrapy.Request(url=url, callback=self.parse_page, meta={'url': urla})