# ArmusPipeline: stores every crawled item in the notifications table.
from scrapy.exceptions import DropItem

from db_model.db_config import DBSession
from db_model.notifications import Notification


class ArmusPipeline:
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        self.reset_data(item)
        try:
            # persist the item into the Notification table
            notification = Notification(url=item['url'], title=item['title'], college=item['college'],
                                        speaker=item['speaker'], venue=item['venue'], time=item['time'],
                                        notify_time=item['notify_time'])
            self.session.add(notification)
            self.session.commit()
        except Exception as e:
            print(e)
        return item

    def reset_data(self, item):
        # url is the primary key of a notification
        if 'url' not in item:
            raise DropItem('Invalid item found: %s' % item)
        if 'title' not in item:
            item['title'] = ''
        if 'college' not in item:
            item['college'] = ''
        if 'speaker' not in item:
            item['speaker'] = ''
        if 'venue' not in item:
            item['venue'] = ''
        if 'time' not in item:
            item['time'] = ''
        if item['notify_time'] == '':
            item['notify_time'] = item['time']
# ArmusPipeline (revised): rolls back on failure and derives notify_time from the
# lecture time when the notice publication time is missing.
import re

from scrapy.exceptions import DropItem

from db_model.db_config import DBSession
from db_model.notifications import Notification


class ArmusPipeline:
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        self.reset_data(item)
        try:
            # persist the item into the Notification table
            notification = Notification(url=item['url'], title=item['title'], college=item['college'],
                                        speaker=item['speaker'], venue=item['venue'], time=item['time'],
                                        notify_time=item['notify_time'])
            self.session.add(notification)
            self.session.commit()
        except Exception as e:
            print(e)
            self.session.rollback()
        return item

    def reset_data(self, item):
        # url is the primary key of a notification
        if 'url' not in item:
            raise DropItem('Invalid item found: %s' % item)
        if 'title' not in item:
            item['title'] = ''
        if 'college' not in item:
            item['college'] = ''
        if 'speaker' not in item:
            item['speaker'] = ''
        if 'venue' not in item:
            item['venue'] = ''
        if 'time' not in item:
            item['time'] = ''
        if item['notify_time'] == '':
            # fall back to the date part of the lecture time (yyyy-mm-dd)
            nt = re.search(r'(\d{4})\D{1,2}(\d{1,2})\D{1,2}(\d{1,2})', item['time'])
            if nt:
                item['notify_time'] = nt.group(1) + '-' + nt.group(2) + '-' + nt.group(3)
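# The pipeline only runs if Scrapy's ITEM_PIPELINES setting points at it. A minimal
# settings.py sketch, using the path already referenced in the commented-out
# custom_settings of ThuIiisSpider further below:
ITEM_PIPELINES = {
    'armus1.pipelines.ArmusPipeline': 300,  # lower numbers run earlier in the pipeline chain
}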
# Data_Spider (console version): collects seed information interactively and drives the crawls.
class Data_Spider:
    def __init__(self):
        self.process = CrawlerProcess(get_project_settings())
        self.db = DBSession()
        self.init_seed_data()
        # default matching keywords
        # self.title_word = str(input('请输入学术讲座通知的匹配关键字:'))
        self.title_word = ''  # keywords used to filter notice titles; empty means no filtering
        self.title = '报告题目:,学术报告:,题目,报告主题:,Title'
        self.speaker = '报告人:,主讲人:,汇报人:,Speaker'
        self.venue = '地点:,Address,Venue,Place'
        self.time = '日期:,时间:,Time'

    # initialise the seed table with default data if it is empty
    def init_seed_data(self):
        init = self.db.query(Seed).all()
        if len(init) == 0:
            init_data = Seed()
            init_data.set_init_data(self.db)

    def set_college_url(self, college_url):
        # start_url of the notice listing page
        self.college_url = college_url

    def set_college(self, college):
        self.college = college

    def set_next_xpath(self, next_xpath):
        self.next_xpath = next_xpath

    def set_url_xpath(self, url_xpath):
        self.url_xpath = url_xpath

    def set_text_xpath(self, text_xpath):
        self.text_xpath = text_xpath

    # multiple keywords are separated by ","
    def set_title_word(self, title_word):
        self.title_word = title_word

    def set_notify_time_xpath(self, notify_time_xpath):
        if len(notify_time_xpath) > 0:
            self.notify_time_xpath = notify_time_xpath
        else:
            self.notify_time_xpath = ''

    def set_title(self, title):
        if len(title) > 0:
            self.title = self.title + ',' + title
        self.title = self.title.replace(',', ',')

    def set_speaker(self, speaker):
        if len(speaker) > 0:
            self.speaker = self.speaker + ',' + speaker
        self.speaker = self.speaker.replace(',', ',')

    def set_venue(self, venue):
        if len(venue) > 0:
            self.venue = self.venue + ',' + venue
        self.venue = self.venue.replace(',', ',')

    def set_time(self, time):
        if len(time) > 0:
            self.time = self.time + ',' + time
        self.time = self.time.replace(',', ',')

    # collect a new seed record from console input and store it
    def insert_seed(self, college_url):
        self.set_college_url(college_url)
        college = str(input('请输入需要爬取的学校(学院)的名称:'))
        self.set_college(college)
        next_xpath = str(input('请输入通知网站下一页的xpath选择器路径:'))
        self.set_next_xpath(next_xpath)
        url_xpath = str(input('请输入通知网站下每个具体网站超链接的xpath路径:'))
        self.set_url_xpath(url_xpath)
        text_xpath = str(input('请输入具体通知页面下,爬取通知正文每行文字的xpath路径:'))
        self.set_text_xpath(text_xpath)
        notify_time_xpath = str(input('请输入具体通知页面下,爬取通知时间的xpath路径,默认为空(不存在就不必输入):'))
        self.set_notify_time_xpath(notify_time_xpath)
        # the five items above are required; the ones below are optional
        title = str(input('请输入报告标题的字符匹配规则:(可选择不输入)'))
        self.set_title(title)
        speaker = str(input('请输入报告人的字符匹配规则:(可选择不输入)'))
        self.set_speaker(speaker)
        venue = str(input('请输入报告地点的字符匹配规则:(可选择不输入)'))
        self.set_venue(venue)
        time = str(input('请输入报告时间的字符匹配规则:(可选择不输入)'))
        self.set_time(time)
        seed = Seed(start_url=self.college_url, college=self.college, url_xpath=self.url_xpath,
                    nextpage_xpath=self.next_xpath, title_word=self.title_word,
                    notice_time_xpath=self.notify_time_xpath,
                    title=self.title, speaker=self.speaker, venue=self.venue, time=self.time,
                    text_xpath=self.text_xpath)
        self.db.add(seed)
        self.db.commit()
        return seed

    # urls already stored for this college, so they are not crawled twice
    def get_existed_urls(self, seed):
        existed_urls = []
        urls = self.db.query(Notification.url).filter(Notification.college == seed.college).all()
        if len(urls) > 0:
            for url in urls:
                existed_urls.append(url[0])
        return existed_urls

    # generic crawl flow for one college
    def common_spider(self, seed):
        urlHandle = UrlHandle()
        existed_urls = self.get_existed_urls(seed)
        urlHandle.set_start_url(seed.start_url)
        urlHandle.set_title_word(seed.title_word)
        urlHandle.set_existed_urls(existed_urls)
        urlHandle.set_nextpage_xpath(seed.nextpage_xpath)
        urlHandle.set_url_xpath(seed.url_xpath)
        title_urls = urlHandle.get_filte_urls()
        self.process.crawl(NoticeSpider, seed, title_urls)
        self.process.start()

    # crawl a single college
    def university_spider(self, seed):
        # college_url = self.set_college_url()
        # seed = self.db.query(Seed).filter(Seed.start_url == college_url).one()
        if seed.start_url == 'https://iiis.tsinghua.edu.cn/zh/seminars/':
            # Tsinghua IIIS has its own dedicated spider
            self.process.crawl(ThuIiisSpider)
            self.process.start()
        else:
            self.common_spider(seed)

    # crawl every college in the seed table; crawling all of them in one run may fail
    def universities_spider(self):
        seeds = self.db.query(Seed).all()
        for seed in seeds:
            # reuse the single-college flow for each seed
            self.university_spider(seed)

    def start_spider(self):
        is_one_spider = str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
        while True:
            print(is_one_spider)
            if is_one_spider in ['y', 'Y', 'yes', 'Yes']:
                college_url = str(input('请输入需要爬取的学校的通知网址:'))
                seed = self.db.query(Seed).filter(Seed.start_url == college_url).all()
                if len(seed) == 0:
                    seed = self.insert_seed(college_url)
                    self.university_spider(seed)
                else:
                    self.university_spider(seed[0])
                is_continue = str(input('爬取完成,是否继续?y/n'))
                if is_continue in ['y', 'Y', 'yes', 'Yes']:
                    is_one_spider = str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
                else:
                    break
            elif is_one_spider in ['n', 'no', 'No', 'N']:
                self.universities_spider()
                print('所有信息爬取完成!')
                break
            else:
                print('你的输入错误,请重新输入:')
                is_one_spider = str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
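# Usage sketch for the console workflow above (assumes this module is run as a script
# and that the imports from the main module are in scope):
if __name__ == '__main__':
    spider = Data_Spider()
    spider.start_spider()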
# SeleniumSpider: renders each notice page with Selenium and extracts the lecture
# information by keyword matching.
import re

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from db_model.db_config import DBSession
from db_model.notifications import Notification
# KeyWords holds the default keyword lists used for matching; its module path is assumed here
from KeyWords import KeyWords


class SeleniumSpider:
    def __init__(self, seed, title_urls):
        self.session = DBSession()
        self.key_word = KeyWords()  # keyword patterns used for matching
        self.seed = seed
        self.title_urls = title_urls
        self.urls = list(title_urls.values())
        self.information = {'title': self.key_word.title, 'speaker': self.key_word.speaker,
                            'time': self.key_word.time, 'venue': self.key_word.venue}

    def start_selenium(self):
        for url in self.urls:
            try:
                item = self.url_parse(url)
                self.save_data(item)
            except Exception as e:
                print(e)

    def url_parse(self, url):
        browser.get(url)
        item = {'url': '', 'college': '', 'title': '', 'speaker': '', 'time': '', 'venue': '',
                'notify_time': ''}
        texts = []
        # publication time of the notice itself
        if self.seed.notice_time_xpath != '':
            notify_time = browser.find_element_by_xpath(self.seed.notice_time_xpath).text
        else:
            notify_time = ''
        try:
            wait.until(EC.presence_of_element_located((By.XPATH, self.seed.text_xpath)))
            contents = browser.find_element_by_xpath(self.seed.text_xpath).text
            print('contents---->', contents)
            content = contents.split('\n')
            for line in content:
                if line.replace(' ', '') != '':
                    texts.append(line)
            print('texts-->', texts)
            # match the notice text against the keyword patterns we care about
            for (k, v_list) in self.information.items():
                temp_list_k = []
                for text in texts:
                    text = text.replace('\xa0', '').replace(':', ':').replace('\r', '').replace('\n', '').strip()
                    if '简介' not in text.replace(' ', '') and '介绍' not in text.replace(' ', ''):
                        for word in v_list:
                            if word in text.replace(' ', ''):  # spaces are stripped only for matching, not output
                                temp = text
                                if len(text.replace(' ', '')) > 150:
                                    if ':' in text:
                                        temp = text.split(':')[1]
                                # skip values that were already collected
                                if temp not in temp_list_k:
                                    temp_list_k.append(temp)
                item[k] = ','.join(temp_list_k)  # several lectures in one notice are joined with ','
        except Exception as e:
            print(e)
        item['url'] = url
        item['college'] = self.seed.college  # university name comes from the seed
        # fall back to the listing-page title when the notice body has no title line
        if item['title'] == '':
            item['title'] = list(self.title_urls.keys())[list(self.title_urls.values()).index(str(item['url']))]
        if notify_time == '':
            # the publication time may be embedded in the listing-page title
            notify_time = list(self.title_urls.keys())[list(self.title_urls.values()).index(item['url'])]
        # normalise the publication time to yyyy-mm-dd
        nt = re.search(r'.*?(\d{4}).(\d+).(\d+)', notify_time)
        if nt is not None:
            notify_time = self.format_notice_time(nt)
        item['notify_time'] = notify_time
        report_time = item['time']
        # normalise the lecture time to yyyy-mm-dd hh:mm
        st = re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+).+(\d+)', report_time)
        if st is not None:
            item['time'] = self.format_time(st)
        notification = item
        return notification

    def format_notice_time(self, notice_time):
        y = notice_time.group(1)
        m = notice_time.group(2)
        d = notice_time.group(3)
        if len(y) == 2:
            y = '20' + y
        if len(m) == 1:
            m = '0' + m
        if len(d) == 1:
            d = '0' + d
        notice_date = y + '-' + m + '-' + d
        return notice_date

    def format_time(self, time):
        date = self.format_notice_time(time)
        h = time.group(4)
        m = time.group(5)
        if len(h) == 1:
            h = '0' + h
        if len(m) == 1:
            m = '0' + m
        print(date + ' ' + h + ':' + m, '<---------讲座时间')
        return date + ' ' + h + ':' + m

    def save_data(self, item):
        self.reset_data(item)
        if '-' not in item['time']:
            item['time'] = self.format_time_again(item['time'], item['notify_time'])
        try:
            if '-' not in item['time']:
                item['time'] = self.format_time_again(item['time'], item['notify_time'])
            notification = Notification(url=item['url'], title=item['title'], college=item['college'],
                                        speaker=item['speaker'], venue=item['venue'], time=item['time'],
                                        notify_time=item['notify_time'])
            # store the notification via SQLAlchemy
            self.session.add(notification)
            self.session.commit()
        except Exception as e:
            print(e)
            self.session.rollback()

    def reset_data(self, item):
        # url is the primary key of a notification
        if 'url' not in item:
            print('Invalid item found: %s' % item)
        if 'title' not in item:
            item['title'] = ''
        if 'college' not in item:
            item['college'] = ''
        if 'speaker' not in item:
            item['speaker'] = ''
        if 'venue' not in item:
            item['venue'] = ''
        if 'time' not in item:
            item['time'] = ''
        if item['notify_time'] == '':
            item['notify_time'] = item['time']

    def format_time_again(self, time, notify_time):
        # "m月d日 ... h(:min)" — take the year from the notice publication time
        if re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time):
            st = re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time)
            y = notify_time.split('-')[0]
            mon = st.group(1)
            d = st.group(2)
            h = st.group(3)
            if '下午' in time:  # "下午" marks an afternoon hour
                if int(h) < 12:
                    h = str(int(h) + 12)
            if re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time):
                st = re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time)
                min = st.group(4)
            else:
                min = '00'
            report_time = y + '-' + mon + '-' + d + ' ' + h + ':' + min
            return report_time
        # "yyyy年m月d日 ... h(:min)"
        elif re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time):
            st = re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time)
            y = st.group(1)
            mon = st.group(2)
            d = st.group(3)
            h = st.group(4)
            if '下午' in time:
                if int(h) < 12:
                    h = str(int(h) + 12)
            if re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time):
                st = re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time)
                min = st.group(5)
            else:
                min = '00'
            report_time = y + '-' + mon + '-' + d + ' ' + h + ':' + min
            return report_time
        # keep the original string when no pattern matches
        return time
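# SeleniumSpider relies on module-level `browser` and `wait` objects that are not
# defined in this excerpt. A minimal sketch of how they could be created with the
# Selenium 3 API the class uses (find_element_by_xpath); the driver, headless flag
# and timeout are assumptions:
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait

options = webdriver.ChromeOptions()
options.add_argument('--headless')           # crawl without opening a browser window
browser = webdriver.Chrome(options=options)  # assumes chromedriver is on PATH
wait = WebDriverWait(browser, 10)            # explicit wait used by url_parse()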
# Spider dedicated to the Tsinghua IIIS seminar listing (table layout, no detail pages).
import re

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

from db_model.db_config import DBSession
from db_model.notifications import Notification
# Armus_Item is the project's Scrapy item; the armus1.items module path is assumed here
from armus1.items import Armus_Item


class ThuIiisSpider(CrawlSpider):
    name = 'thu_iiis'
    allowed_domains = ['iiis.tsinghua.edu.cn']
    start_urls = ['https://iiis.tsinghua.edu.cn/list-265-1.html']
    domain_url = 'https://iiis.tsinghua.edu.cn'
    # custom_settings = {
    #     'ITEM_PIPELINES': {'armus1.pipelines.ArmusPipeline': 300}
    # }
    rules = (
        Rule(LinkExtractor(allow=r'https://iiis.tsinghua.edu.cn/list-265-\d{1,2}.html'),
             callback='parse_item', follow=None),
        # Rule(LinkExtractor(restrict_xpaths='.//ul[@class="pagination"]/li[last()-1]/a'), callback=None, follow=True)
    )

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self.college = '清华大学交叉信息研究院'
        self.db = DBSession()

    def parse_item(self, response):
        # every row of the seminar table is one report
        reports = response.xpath('.//tbody//tr')
        for report in reports:
            notice_url = self.domain_url + report.xpath('./td/a/@href').get()
            if self.is_existed_urls(notice_url):
                continue
            else:
                notify_time = ''
                title = report.xpath('./td/a/text()').getall()
                title = ' '.join(''.join(title).split())
                speaker = report.xpath('./td[2]//text()').getall()
                speaker = ' '.join(''.join(speaker).split())
                time = report.xpath('./td[3]//text()').get()
                # time = self.format_time(time, notify_time)
                venue = report.xpath('./td[4]//text()').get()
                item = Armus_Item(title=title, speaker=speaker, venue=venue, time=time,
                                  url=notice_url, college=self.college, notify_time=notify_time)
                yield item
        next_page = response.xpath("//ul[@class='pagination']/li[last()-1]/a/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse_item)

    def is_existed_urls(self, notice_url):
        # skip notices whose url is already stored
        url = self.db.query(Notification.url).filter(Notification.url == notice_url).all()
        if len(url):
            return True
        else:
            return False

    def format_time(self, time, notify_time):
        # "yyyy年m月d日 (下午) HH:MM"
        if re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+).+(\d+)', time):
            st = re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+).+(\d+)', time)
            y = st.group(1)
            mon = st.group(2)
            d = st.group(3)
            if len(y) == 2:
                y = '20' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            h = st.group(4)
            if re.search(r'.*?(\d{4}).(\d+).(\d+)..*?下午(\d+).+(\d+)', time):
                h = str(int(h) + 12)
            min = st.group(5)
            if len(h) == 1:
                h = '0' + h
            if len(min) == 1:
                min = '0' + min
        # "yyyy年m月d日 (下午) HH"
        elif re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+)', time):
            st = re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+)', time)
            y = st.group(1)
            mon = st.group(2)
            d = st.group(3)
            if len(y) == 2:
                y = '20' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            h = st.group(4)
            if re.search(r'.*?(\d{4}).(\d+).(\d+)..*?下午(\d+).', time):
                h = str(int(h) + 12)
            if len(h) == 1:
                h = '0' + h
            min = '00'
        # "m月d日 (下午) HH:MM"
        elif re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).+(\d*)', time):
            st = re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).+(\d*)', time)
            y = notify_time.split('-')[0]
            mon = st.group(1)
            d = st.group(2)
            h = st.group(3)
            min = st.group(4)
            if re.search(r'.*?(\d{1,2}).(\d+)..*?下午(\d+).+(\d*)', time):
                h = str(int(h) + 12)
            if len(y) == 1:
                y = '0' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            if len(h) == 1:
                h = '0' + h
            if len(min) == 1:
                min = '0' + min
        # "m月d日 (下午) HH"
        elif re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).', time):
            st = re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).', time)
            y = notify_time.split('-')[0]
            mon = st.group(1)
            d = st.group(2)
            h = st.group(3)
            if re.search(r'.*?(\d{1,2}).(\d+)..*?下午(\d+).', time):
                h = str(int(h) + 12)
            if len(y) == 1:
                y = '0' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            if len(h) == 1:
                h = '0' + h
            min = '00'
        else:
            # no recognisable date in the string: fall back to a fixed default
            y = '2000'
            mon = '01'
            d = '01'
            h = '00'
            min = '00'
        report_time_ = y + '-' + mon + '-' + d + ' ' + h + ":" + min
        return report_time_
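# Armus_Item is referenced by the spiders but not shown in this excerpt. A minimal
# sketch of what armus1/items.py would need to contain for the fields used above
# (field names taken from the item dict keys and the pipeline constructor calls):
import scrapy


class Armus_Item(scrapy.Item):
    url = scrapy.Field()          # primary key of a notification
    title = scrapy.Field()
    college = scrapy.Field()
    speaker = scrapy.Field()
    venue = scrapy.Field()
    time = scrapy.Field()         # lecture time
    notify_time = scrapy.Field()  # publication time of the notice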
# Data_Spider (revised for the GUI): seed attributes are set by the interface instead
# of console prompts, and the Selenium-based crawl replaces NoticeSpider.
class Data_Spider:
    def __init__(self):
        self.process = CrawlerProcess(get_project_settings())
        self.db = DBSession()
        self.init_seed_data()
        # default matching keywords
        self.title = '报告题目:,学术报告:,题目,报告主题:,Title'
        self.speaker = '报告人:,主讲人:,汇报人:,Speaker,报告专家'
        self.venue = '地点:,Address,Venue,Place'
        self.time = '日期:,时间:,Time'
        self.title_word = ''

    # initialise the seed table with default data if it is empty
    def init_seed_data(self):
        init = self.db.query(Seed).all()
        if len(init) == 0:
            init_data = Seed()
            init_data.set_init_data(self.db)

    def set_college_url(self, college_url):
        # start_url of the notice listing page
        self.college_url = college_url

    def set_college(self, college):
        self.college = college

    def set_next_xpath(self, next_xpath):
        self.next_xpath = next_xpath

    def set_url_xpath(self, url_xpath):
        self.url_xpath = url_xpath

    def set_text_xpath(self, text_xpath):
        self.text_xpath = text_xpath

    # multiple keywords are separated by ","
    def set_title_word(self):
        self.title_word = ''

    def set_notify_time_xpath(self, notify_time_xpath):
        if len(notify_time_xpath) > 0:
            self.notify_time_xpath = notify_time_xpath
        else:
            self.notify_time_xpath = ''

    # per-field keyword setters (now deprecated)

    def insert_seed(self, db):
        # the GUI sets the attributes below before calling this method,
        # so the console prompts of the earlier version are no longer needed here
        try:
            seed = Seed(start_url=self.college_url, college=self.college, url_xpath=self.url_xpath,
                        nextpage_xpath=self.next_xpath, title_word=self.title_word,
                        notice_time_xpath=self.notify_time_xpath,
                        # title=self.title, speaker=self.speaker, venue=self.venue, time=self.time,
                        text_xpath=self.text_xpath)
            db.add(seed)
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
            print('插入数据失败')

    # urls already stored for this college, so they are not crawled twice
    def get_existed_urls(self, seed):
        existed_urls = []
        urls = self.db.query(Notification.url).filter(Notification.college == seed.college).all()
        if len(urls) > 0:
            for url in urls:
                existed_urls.append(url[0])
        return existed_urls

    # generic crawl flow for one college
    def common_spider(self, seed):
        urlHandle = UrlHandle()
        existed_urls = self.get_existed_urls(seed)
        urlHandle.set_start_url(seed.start_url)
        urlHandle.set_title_word(seed.title_word)
        urlHandle.set_existed_urls(existed_urls)
        urlHandle.set_nextpage_xpath(seed.nextpage_xpath)
        urlHandle.set_url_xpath(seed.url_xpath)
        title_urls = urlHandle.get_filte_urls()
        selenium_spider = SeleniumSpider(seed, title_urls)
        selenium_spider.start_selenium()
        # self.process.crawl(NoticeSpider, seed, title_urls)
        # self.process.start()

    # crawl a single college
    def university_spider(self, seed):
        if seed.start_url == 'https://iiis.tsinghua.edu.cn/zh/seminars/':
            # Tsinghua IIIS has its own dedicated spider
            self.process.crawl(ThuIiisSpider)
            self.process.start()
        else:
            self.common_spider(seed)

    # crawl every college in the seed table; crawling all of them in one run may fail
    def universities_spider(self):
        seeds = self.db.query(Seed).all()
        for seed in seeds:
            # reuse the single-college flow for each seed
            self.university_spider(seed)


# executed from the main program:
# spider = Data_Spider()
# spider.start_spider()

# Example seed values for 西南交通大学信息科学与技术学院:
#   listing page url:        http://sist.swjtu.edu.cn/list.do?action=news&navId=40
#   next-page xpath:         //div[@class="tableFootLeft"]//a[text()="下一页"]
#   notice-link xpath:       //*[@id="rightPageContent"]/dl//dd
#   notice-body xpath:       //*[@id="newsBody"]
#   publication-time xpath:  //*[@id="newsInfo"]

# Example seed values for 贵州大学计算机科学与技术学院:
#   listing page url:        http://cs.gzu.edu.cn/forum.php?mod=forumdisplay&fid=57&page=1
#   notice-link xpath:       //*[@id="newsList"]//p
#   next-page xpath:         //*[@id="bmbw0pgscl"]/div//a[text()='下一页']
#   publication-time xpath:  //*[@id="ct"]/div[1]/div/div[1]/p
#   notice-body xpath:       //td[@class="t_f"]
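# Usage sketch for the GUI-era workflow: the interface is expected to set the seed
# attributes before calling insert_seed(). The values below are taken from the
# 西南交通大学 example above; the explicit re-query of the seed is an assumption.
spider = Data_Spider()
spider.set_college_url('http://sist.swjtu.edu.cn/list.do?action=news&navId=40')
spider.set_college('西南交通大学信息科学与技术学院')
spider.set_next_xpath('//div[@class="tableFootLeft"]//a[text()="下一页"]')
spider.set_url_xpath('//*[@id="rightPageContent"]/dl//dd')
spider.set_text_xpath('//*[@id="newsBody"]')
spider.set_notify_time_xpath('//*[@id="newsInfo"]')
spider.insert_seed(spider.db)                 # store the seed record
seed = spider.db.query(Seed).filter(Seed.start_url == spider.college_url).one()
spider.university_spider(seed)                # crawl this single college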
import datetime
from datetime import timedelta

from sqlalchemy import Column, String, and_, or_, desc

from db_model.db_config import DBSession, Base  # session factory and declarative base from the project's db config


class Notification(Base):
    """the class mapped to the db_model table notifications"""
    __tablename__ = 'notifications'
    url = Column(String(255), primary_key=True)  # url of the full notice text
    title = Column(String(255))                  # lecture title
    college = Column(String(255))                # university hosting the lecture
    speaker = Column(String(255))                # speaker
    venue = Column(String(255))                  # venue
    time = Column(String(255))                   # lecture time
    notify_time = Column(String(20))             # publication time of the notice

    # sort by publication time, keeping only the last year of data
    def orderbyrelease(self):
        self.session = DBSession()
        self.info_byrelease = []
        temp = self.session.query(Notification).filter(
            and_(
                Notification.notify_time >= datetime.datetime.now() - timedelta(days=365),  # time window
                or_(Notification.title.like("%密码学%"),
                    Notification.title.like("%信息安全%"),
                    Notification.title.like("%security%"),
                    Notification.title.like("%password%"))  # topic filter
            )).order_by(desc(Notification.notify_time)).all()
        print("按照通知发布时间由近及远排序:")
        for t in temp:
            t_dict = t.__dict__
            info = {
                'title': t_dict['title'],
                'speaker': t_dict['speaker'],
                'time': t_dict['time'],
                'venue': t_dict['venue'],
                'college': t_dict['college'],
                'url': t_dict['url'],
                'notify_time': t_dict['notify_time']
            }
            self.info_byrelease.append(info)
        print(self.info_byrelease)

    # sort by lecture time, keeping only the last year of data
    def orderbytime(self):
        self.session = DBSession()
        self.info_bytime = []
        temp = self.session.query(Notification).filter(
            and_(
                Notification.time >= datetime.datetime.now() - timedelta(days=365),  # time window
                or_(Notification.title.like("%密码学%"),
                    Notification.title.like("%信息安全%"),
                    Notification.title.like("%security%"),
                    Notification.title.like("%password%"))  # topic filter
            )).order_by(desc(Notification.time)).all()
        print("按照报告举行时间由近及远排序:")
        for t in temp:
            t_dict = t.__dict__
            info = {
                'title': t_dict['title'],
                'speaker': t_dict['speaker'],
                'time': t_dict['time'],
                'venue': t_dict['venue'],
                'college': t_dict['college'],
                'url': t_dict['url'],
                'notify_time': t_dict['notify_time']
            }
            self.info_bytime.append(info)

    # lecture info sorted by lecture time
    def get_info_bytime(self):
        return self.info_bytime

    # lecture info sorted by publication time
    def get_info_byrelease(self):
        return self.info_byrelease
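# db_model/db_config.py is not shown in this excerpt. A minimal sketch of what it
# would need to provide for the code above; the engine URL is a placeholder assumption:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

engine = create_engine('mysql+pymysql://user:password@localhost/armus?charset=utf8mb4')
Base = declarative_base()              # base class for the Notification / Seed models
DBSession = sessionmaker(bind=engine)  # session factory used throughout the project

# Base.metadata.create_all(engine) creates the tables; it is typically called after
# the model modules are imported so their tables are registered on Base.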
# main module: wires the database models, the url handler and the spiders together
from db_model.seeds import Seed
from db_model.db_config import DBSession
from db_model.notifications import Notification
from UrlHandle import UrlHandle
from armus1.spiders.notice import NoticeSpider
from armus1.spiders.thu_iiis import ThuIiisSpider
# scrapy api
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(get_project_settings())
db = DBSession()

# Example seed records kept for reference:
# scut_se = Seed(start_url='http://www2.scut.edu.cn/sse/xshd/list.htm', college='华南理工大学软件学院',
#                url_xpath='.//*[@class="news_ul"]//li',
#                nextpage_xpath='//*[@id="wp_paging_w67"]/ul/li[2]/a[3]',
#                title_word='举办,举行',
#                notice_time_xpath='//*[@id="page-content-wrapper"]/div[2]/div/div/div[2]/div/div/div/p/span[1]',
#                title='汇报主题:,报告题目:,题目:,Title:,报告主题:', speaker='汇报人:,报告人:,Speaker',
#                venue='地点:,venue:,Address:', time='Time:,时间:',
#                text_xpath='//*[@id="page-content-wrapper"]/div[2]/div/div/div[2]/div/div/div/div[2]/div/div//p')
# jnu_xx = Seed(start_url='https://xxxy2016.jnu.edu.cn/Category_37/Index.aspx',
#               college='暨南大学信息科学技术学院/网络空间安全学院',
#               url_xpath='//*[@id="mainContent"]/div[2]/ul//li',
#               nextpage_xpath='//*[@id="pe100_page_通用信息列表_普通式"]/div/a[9]',
#               title_word='学术讲座',
#               notice_time_xpath='//*[@id="mainContent"]/div[2]/div/div[1]/span[3]',