Example #1
 def orderbytime(self):  # sort by the time the talk is held; only keep the past year's data
     self.session = DBSession()
     self.info_bytime = []
     # temp = self.session.query(Notification).filter(
     #     Notification.notify_time >= datetime.datetime.now() - timedelta(days=365)).order_by(
     #     desc(Notification.time)).all()
     temp = self.session.query(Notification).filter(
         and_(
             Notification.time >=
             datetime.datetime.now() - timedelta(days=365),  # time window: past year
             or_(Notification.title.like("%密码学%"),
                 Notification.title.like("%信息安全%"),
                 Notification.title.like("%security%"),
                 Notification.title.like("%password%"))  # keyword filter
         )).order_by(desc(Notification.time)).all()
     print("按照报告举行时间由近及远排序:")
     for t in temp:
         t_dict = t.__dict__
         info = {
             'title': t_dict['title'],
             'speaker': t_dict['speaker'],
             'time': t_dict['time'],
             'venue': t_dict['venue'],
             'college': t_dict['college'],
             'url': t_dict['url'],
             'notify_time': t_dict['notify_time']
         }
         self.info_bytime.append(info)
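
These excerpts all assume a shared db_model.db_config module that exposes the engine and the DBSession session factory. A minimal sketch of what that module likely contains (the connection URL is an assumption, not taken from the project):

# db_model/db_config.py -- hypothetical reconstruction
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///armus.db')  # assumed URL; the real project may use MySQL or similar
DBSession = sessionmaker(bind=engine)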
Example #2
 def __init__(self, seed, title_urls):
     self.session = DBSession()
     self.key_word = KeyWords()    # keyword patterns for matching
     self.seed = seed
     self.title_urls = title_urls
     self.urls = list(title_urls.values())
     self.information = {'title': self.key_word.title, 'speaker': self.key_word.speaker,
                         'time': self.key_word.time, 'venue': self.key_word.venue}
Example #3
 def __init__(self):
     self.process = CrawlerProcess(get_project_settings())
     self.db = DBSession()
     self.init_seed_data()
     # set default keyword patterns
     # self.title_word=str(input('请输入学术讲座通知的匹配关键字:'))
     self.title = '报告题目:,学术报告:,题目,报告主题:,Title'        # (default values)
     self.speaker = '报告人:,主讲人:,汇报人:,Speaker'
     self.venue = '地点:,Address,Venue,Place'
     self.time = '日期:,时间:,Time'
Example #4
class ArmusPipeline:
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        self.reset_data(item)
        try:
            notification = Notification(url=item['url'],
                                        title=item['title'],
                                        college=item['college'],
                                        speaker=item['speaker'],
                                        venue=item['venue'],
                                        time=item['time'],
                                        notify_time=item['notify_time'])
            # save to the Notification table
            self.session.add(notification)
            self.session.commit()
        except Exception as e:
            print(e)
        return item

    def reset_data(self, item):
        # url is the primary key of a notification
        if 'url' not in item:
            raise DropItem('Invalid item found: %s' % item)

        if 'title' not in item:
            item['title'] = ''

        if 'college' not in item:
            item['college'] = ''

        if 'speaker' not in item:
            item['speaker'] = ''

        if 'venue' not in item:
            item['venue'] = ''

        if 'time' not in item:
            item['time'] = ''

        if item.get('notify_time', '') == '':
            item['notify_time'] = item['time']
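
For a pipeline like this to run, it has to be enabled in the Scrapy project settings. A sketch, assuming the 'armus1.pipelines.ArmusPipeline' path that appears in the commented-out custom_settings of the ThuIiisSpider example further below:

# settings.py (sketch; module path taken from the commented-out custom_settings in Example #10)
ITEM_PIPELINES = {
    'armus1.pipelines.ArmusPipeline': 300,
}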
Example #5
class ArmusPipeline:
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        self.reset_data(item)
        try:
            notification = Notification(url=item['url'],
                                        title=item['title'],
                                        college=item['college'],
                                        speaker=item['speaker'],
                                        venue=item['venue'],
                                        time=item['time'],
                                        notify_time=item['notify_time'])
            # save to the Notification table
            self.session.add(notification)
            self.session.commit()
        except Exception as e:
            print(e)
            self.session.rollback()
        return item

    def reset_data(self, item):
        # url is the primary key of a notification
        if 'url' not in item:
            raise DropItem('Invalid item found: %s' % item)

        if 'title' not in item:
            item['title'] = ''

        if 'college' not in item:
            item['college'] = ''

        if 'speaker' not in item:
            item['speaker'] = ''

        if 'venue' not in item:
            item['venue'] = ''

        if 'time' not in item:
            item['time'] = ''

        if item.get('notify_time', '') == '':
            nt = re.search(r'(\d{4})\D{1,2}(\d{1,2})\D{1,2}(\d{1,2})',
                           item['time'])
            if nt:
                y = nt.group(1)
                m = nt.group(2)
                d = nt.group(3)
                item['notify_time'] = y + '-' + m + '-' + d
            else:
                item['notify_time'] = item['time']
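
A quick standalone check of the date-extraction pattern used above; the sample strings are invented, and \D{1,2} is what lets the pattern tolerate both Chinese and ASCII separators:

import re

for s in ['2021年10月3日', '2021-10-03', '时间:2021/10/3 下午3点']:
    nt = re.search(r'(\d{4})\D{1,2}(\d{1,2})\D{1,2}(\d{1,2})', s)
    if nt:
        print(nt.group(1) + '-' + nt.group(2) + '-' + nt.group(3))
# prints 2021-10-3, 2021-10-03, 2021-10-3 (digits are kept exactly as found, no zero-padding)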
Example #6
class Data_Spider:
    def __init__(self):
        self.process = CrawlerProcess(get_project_settings())
        self.db = DBSession()
        self.init_seed_data()
        # set default keyword patterns
        # self.title_word=str(input('请输入学术讲座通知的匹配关键字:'))
        self.title = '报告题目:,学术报告:,题目,报告主题:,Title'        # (default values)
        self.speaker = '报告人:,主讲人:,汇报人:,Speaker'
        self.venue = '地点:,Address,Venue,Place'
        self.time = '日期:,时间:,Time'

    # initialize the seed table data
    def init_seed_data(self):
        init=self.db.query(Seed).all()
        if len(init)==0:
            init_data=Seed()
            init_data.set_init_data(self.db)

    def set_college_url(self,college_url):
        # self.college_url=input('请输入需要爬取的学校的通知网址:')   #start_url
        self.college_url =college_url
    def set_college(self,college):
        self.college=college

    def set_next_xpath(self,next_xpath):
        self.next_xpath=next_xpath

    def set_url_xpath(self,url_xpath):
        self.url_xpath=url_xpath

    def set_text_xpath(self,text_xpath):
        self.text_xpath=text_xpath

    # multiple keywords are separated with ","
    def set_title_word(self,title_word):
        self.title_word=title_word

    def set_notify_time_xpath(self,notify_time_xpath):
        if len(notify_time_xpath)>0:
            self.notify_time_xpath=notify_time_xpath
        else:
            self.notify_time_xpath=''

    def set_title(self, title):
        if len(title) > 0:
            self.title = self.title + ',' + title
        self.title = self.title.replace('，', ',')  # normalize full-width commas
    def set_speaker(self, speaker):
        if len(speaker) > 0:
            self.speaker = self.speaker + ',' + speaker
        self.speaker = self.speaker.replace('，', ',')
    def set_venue(self, venue):
        if len(venue) > 0:
            self.venue = self.venue + ',' + venue
        self.venue = self.venue.replace('，', ',')
    def set_time(self, time):
        if len(time) > 0:
            self.time = self.time + ',' + time
        self.time = self.time.replace('，', ',')

    def insert_seed(self,college_url):
        # college_url=str(input('请输入需要爬取的学校的通知网址:'))
        self.set_college_url(college_url)
        college = str(input('请输入需要爬取的学校(学院)的名称:'))
        self.set_college(college)
        next_xpath=str(input('请输入通知网站下一页的xpath选择器路径:'))
        self.set_next_xpath(next_xpath)
        url_xpath=str(input('请输入通知网站下每个具体网站超链接的xpath路径:'))
        self.set_url_xpath(url_xpath)
        text_xpath=str(input('请输入具体通知页面下,爬取通知正文每行文字的xpath路径:'))
        self.set_text_xpath(text_xpath)
        notify_time_xpath=str(input('请输入具体通知页面下,爬取通知时间的xpath路径,默认为空(不存在就不必输入):'))
        self.set_notify_time_xpath(notify_time_xpath)
        # the five fields above are required; the following ones are optional
        title=str(input('请输入报告标题的字符匹配规则:(可选择不输入)'))
        self.set_title(title)
        speaker = str(input('请输入报告人的字符匹配规则:(可选择不输入)'))
        self.set_speaker(speaker)
        venue = str(input('请输入报告地点的字符匹配规则:(可选择不输入)'))
        self.set_venue(venue)
        time = str(input('请输入报告时间的字符匹配规则:(可选择不输入)'))
        self.set_time(time)
        seed=Seed(start_url= self.college_url,college= self.college,url_xpath= self.url_xpath,
                     nextpage_xpath= self.next_xpath,title_word= self.title_word,notice_time_xpath= self.notify_time_xpath,
                     title= self.title,speaker= self.speaker,venue= self.venue,time= self.time,text_xpath= self.text_xpath)
        self.db.add(seed)
        self.db.commit()
        return seed

    # crawling a single specified school
    def get_existed_urls(self,seed):
        existed_urls = []
        urls = self.db.query(Notification.url).filter(seed.college == Notification.college).all()
        # existed_urls=[]
        if len(urls)>0:
            for url in urls:
                existed_urls.append(url[0])
        return existed_urls

    # common flow for crawling a school's academic notices
    def common_spider(self,seed):
        urlHandle=UrlHandle()
        existed_urls=self.get_existed_urls(seed)
        urlHandle.set_start_url(seed.start_url)
        urlHandle.set_title_word(seed.title_word)
        urlHandle.set_existed_urls(existed_urls)
        urlHandle.set_nextpage_xpath(seed.nextpage_xpath)
        urlHandle.set_url_xpath(seed.url_xpath)
        title_urls=urlHandle.get_filte_urls()
        self.process.crawl(NoticeSpider,seed,title_urls)
        self.process.start()

    # crawl academic notices for a single school
    def university_spider(self,seed):
        # college_url=self.set_college_url()
        # seed = self.db.query(Seed).filter(Seed.start_url == college_url).one()
        if seed.start_url == 'https://iiis.tsinghua.edu.cn/zh/seminars/':    # Tsinghua University
            self.process.crawl(ThuIiisSpider)
            self.process.start()
        else:
            self.common_spider(seed)

    # crawl academic notices for all schools; crawling them all in one go can fail
    def universities_spider(self):
        seeds=self.db.query(Seed).all()
        for seed in seeds:
            # call the single-school crawler for each school
            self.university_spider(seed)

    def start_spider(self):
        is_one_spider=str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
        while True:
            print(is_one_spider)
            if is_one_spider in ['y','Y','yes','Yes']:
                college_url = str(input('请输入需要爬取的学校的通知网址:'))
                seed = self.db.query(Seed).filter(Seed.start_url == college_url).all()
                if len(seed)==0:
                    seed=self.insert_seed(college_url)
                    self.university_spider(seed)
                else:
                    self.university_spider(seed[0])
                is_continue=str(input(('爬取完成,是否继续?y/n')))
                if is_continue in ['y','Y','yes','Yes']:
                    is_one_spider = str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
                else:
                    break
            elif is_one_spider in ['n','no','No','N']:
                self.universities_spider()
                print('所有信息爬取完成!')
                break
            else:
                print('你的输入错误,请重新输入:')
                is_one_spider=str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
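
As the commented-out driver code in Example #11 suggests, this class is meant to be run from the main program; a minimal entry-point sketch:

if __name__ == '__main__':
    spider = Data_Spider()
    spider.start_spider()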
Example #7
 def open_spider(self, spider):
     self.session = DBSession()
Example #8
class SeleniumSpider:
    def __init__(self, seed, title_urls):
        self.session = DBSession()
        self.key_word = KeyWords()    # keyword patterns for matching
        self.seed = seed
        self.title_urls = title_urls
        self.urls = list(title_urls.values())
        self.information = {'title': self.key_word.title, 'speaker': self.key_word.speaker,
                            'time': self.key_word.time, 'venue': self.key_word.venue}
    def start_selenium(self):
        for url in self.urls:
            try:
                item = self.url_parse(url)
                self.save_data(item)
            except Exception as e:
                print(e)
    def url_parse(self,url):
        browser.get(url)
        item = {'url': '', 'college': '', 'title': '', 'speaker': '', 'time': '', 'venue': '', 'notify_time': ''}
        texts = []
        # scrape the publication time of the original notice
        if self.seed.notice_time_xpath != '':
            notify_time = browser.find_element_by_xpath(self.seed.notice_time_xpath).text
        else:
            notify_time = ''
        try:
            wait.until(EC.presence_of_element_located((By.XPATH,self.seed.text_xpath)))
            contents=browser.find_element_by_xpath(self.seed.text_xpath).text
            print('contents---->', contents)
            # test=contents.replace('\n',' --换行-- ')
            # print(test)
            content =contents.split('\n')
            for line in content:
                if line.replace(' ', '') != '':
                    texts.append(line)
            print('texts-->', texts)
            # match the scraped text against the fields we need
            for (k, v_list) in self.information.items():
                # try every pattern for this field
                temp_list_k = []
                for text in texts:
                    text = text.replace('\xa0', '').replace('：', ':').replace('\r', '').replace('\n', '').strip()
                    if '简介' not in text.replace(' ', '') and '介绍' not in text.replace(' ', ''):  # skip bio/introduction lines
                        for word in v_list:
                            if word in text.replace(' ', ''):  # strip spaces only when matching, so spaces inside English text survive
                                temp = text
                                if len(text.replace(' ', '')) > 150:
                                    if ':' in text:
                                        temp = text.split(':')[1]
                                # skip content that was already added
                                if temp not in temp_list_k:
                                    temp_list_k.append(temp)
                item[k] = ','.join(temp_list_k)  # multiple talks are joined with ','
        except Exception as e:
            print(e)
        # item['url']=response.urljoin('')  # get the current url
        item['url'] = url
        item['college'] = self.seed.college  # the university name

        # case where the notice title only appears on the main listing page:
        # recover it by reverse lookup in title_urls (a bit awkward)
        if item['title'] == '':
            item['title'] = list(self.title_urls.keys())[list(self.title_urls.values()).index(str(item['url']))]
        # normalized notice time: yyyy-mm-dd
        if notify_time == '':  # case where the notice time only appears on the main listing page
            notify_time = list(self.title_urls.keys())[list(self.title_urls.values()).index(item['url'])]
        nt = re.search(r'.*?(\d{4}).(\d+).(\d+)', notify_time)
        if nt is not None:
            notify_time = self.format_notice_time(nt)
        item['notify_time'] = notify_time
        # print(notify_time)
        # print(item['time'])
        report_time = item['time']
        # normalized talk start time: yyyy-mm-dd hh:mm
        st = re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+).+(\d+)', report_time)
        if st is not None:
            item['time'] = self.format_time(st)
        notification = item
        return notification
        # if not contents:
        #     contents=contents.find_element_by_xpath('..')
        #     print(contents.text)
        # else:
        #     print('contents')
        # print(contents.text.replace('\n','换行'))
        # content=contents.text.split('\n')
        # print(content)

    def format_notice_time(self,notice_time):
        y=notice_time.group(1)
        m=notice_time.group(2)
        d=notice_time.group(3)
        if len(y)==2:
            y='20'+y
        if len(m)==1:
            m='0'+m
        if len(d)==1:
            d='0'+d
        notice_date=y+'-'+m+"-"+d
        # notice_time_=datetime.strptime(notice_date,'%Y-%m-%d')
        return notice_date

    def format_time(self,time):
        date=self.format_notice_time(time)
        h=time.group(4)
        m=time.group(5)
        if len(h)==1:
            h='0'+h
        if len(m)==1:
            m='0'+m
        print(date+' '+h+':'+m,'<---------讲座时间')
        return date+' '+h+':'+m

    def save_data(self, item):
        self.reset_data(item)
        if '-' not in item['time']:
            item['time'] = self.format_time_again(item['time'], item['notify_time'])
        try:
            notification = Notification(url=item['url'], title=item['title'], college=item['college'],
                                        speaker=item['speaker'], venue=item['venue'], time=item['time'],
                                        notify_time=item['notify_time'])
            # save to the Notification table
            self.session.add(notification)              # sqlalchemy
            self.session.commit()
        except Exception as e:
            print(e)
            self.session.rollback()
    def reset_data(self, item):
        # url is the primary key of a notification
        if 'url' not in item:
            print('Invalid item found: %s' % item)

        if 'title' not in item:
            item['title'] = ''

        if 'college' not in item:
            item['college'] = ''

        if 'speaker' not in item:
            item['speaker'] = ''

        if 'venue' not in item:
            item['venue'] = ''

        if 'time' not in item:
            item['time'] = ''

        if item['notify_time'] == '':
            item['notify_time'] = item['time']

    def format_time_again(self, time, notify_time):
        # match MM月DD日 (optionally 下午 = afternoon) HH[:MM]; the year is borrowed from notify_time
        if re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time):
            st = re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time)
            y = notify_time.split('-')[0]
            mon = st.group(1)
            d = st.group(2)
            h = st.group(3)
            if '下午' in time:  # afternoon
                if int(h) < 12:
                    h = str(int(h) + 12)
            if re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time):
                st = re.search(r'\D*?(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time)
                min = st.group(4)
            else:
                min = '00'
            report_time = y + '-' + mon + '-' + d + ' ' + h + ':' + min
            return report_time
        # match YYYY年MM月DD日 (optionally 下午 = afternoon) HH[:MM]
        elif re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time):
            st = re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})', time)
            y = st.group(1)
            mon = st.group(2)
            d = st.group(3)
            h = st.group(4)
            if '下午' in time:  # afternoon
                if int(h) < 12:
                    h = str(int(h) + 12)
            if re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time):
                st = re.search(r'\D*?(\d{4})年(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})\D+?(\d{1,2})', time)
                min = st.group(5)
            else:
                min = '00'
            report_time = y + '-' + mon + '-' + d + ' ' + h + ':' + min
            return report_time
        return time  # fall back to the raw string if no pattern matched

    # def format_time_again(self,time,notify_time):
    #     #匹配****年**月**日(下午)HH:MM
    #     if re.search(r'.*?(\d{4}).(\d{1,2}).(\d{1,2}).*?(\d{1,2}).(\d{1,2})\D', time):
    #         st=re.search(r'.*?(\d{4}).(\d+).(\d+).*?(\d{1,2}).+(\d{1,2})\D', time)
    #         y = st.group(1)
    #         mon = st.group(2)
    #         d = st.group(3)
    #         if len(y) == 2:
    #             y = '20' + y
    #         if len(mon) == 1:
    #             mon = '0' + mon
    #         if len(d) == 1:
    #             d = '0' + d
    #         h = st.group(4)
    #         if re.search(r'.*?(\d{4}).(\d+).(\d+).*?下午(\d{1,2}).+(\d{1,2})\D', time):
    #             if int(h)<12:
    #                 h=str(int(h)+12)
    #         min = st.group(5)
    #         if len(h) == 1:
    #             h = '0' + h
    #         if len(min) == 1:
    #             min = '0' + min
    #     # 匹配****年**月**日(下午)HH
    #     elif re.search(r'.*?(\d{4}).(\d+).(\d{1,2}).*?(\d{1,2})\D', time):
    #         st=re.search(r'.*?(\d{4}).(\d+).(\d{1,2}).*?(\d{1,2})\D', time)
    #         y = st.group(1)
    #         mon = st.group(2)
    #         d = st.group(3)
    #         if len(y) == 2:
    #             y = '20' + y
    #         if len(mon) == 1:
    #             mon = '0' + mon
    #         if len(d) == 1:
    #             d = '0' + d
    #         h = st.group(4)
    #         if re.search(r'.*?(\d{4}).(\d+).(\d{1,2}).*?下午(\d{1,2})\D', time):
    #             h=str(int(h)+12)
    #         if len(h) == 1:
    #             h = '0' + h
    #         min='00'
    #     # 匹配**月**日(下午)HH:MM
    #     elif re.search(r'.*?(\d{1,2}).(\d+).*?(\d+).+(\d{1,2})\D',time):
    #         st=re.search(r'.*?(\d{1,2}).(\d+).*?(\d+).+(\d{1,2})\D',time)
    #         y=notify_time.split('-')[0]
    #         mon=st.group(1)
    #         d=st.group(2)
    #         h=st.group(3)
    #         min=st.group(4)
    #         if re.search(r'.*?(\d{1,2}).(\d+).*?下午(\d+).+(\d{1,2})\D',time):
    #             h=str(int(h)+12)
    #         if len(y)==1:
    #             y='0'+y
    #         if len(mon) == 1:
    #             mon = '0' + mon
    #         if len(d) == 1:
    #             d = '0' + d
    #         if len(h) == 1:
    #             h = '0' + h
    #         if len(min) == 1:
    #             min = '0' + min
    #     # 匹配**月**日(下午)HH
    #     elif re.search(r'.*?(\d{1,2}).(\d+).*?(\d{1,2})\D',time):
    #         st=re.search(r'.*?(\d{1,2}).(\d+).*?(\d{1,2})\D',time)
    #         y=notify_time.split('-')[0]
    #         mon=st.group(1)
    #         d=st.group(2)
    #         h=st.group(3)
    #         if re.search(r'.*?(\d{1,2}).(\d+).*?下午(\d{1,2})\D',time):
    #             h=str(int(h)+12)
    #         if len(y)==1:
    #             y='0'+y
    #         if len(mon) == 1:
    #             mon = '0' + mon
    #         if len(d) == 1:
    #             d = '0' + d
    #         if len(h) == 1:
    #             h = '0' + h
    #         min='00'
    #         #(r'.*?(\d+).(\d+)..*?(\d+).+(\d*)')
    #         # (r'.*?(\d+).(\d+)..*?下午(\d+).*?+(\d*)')
    #     elif re.search(r'.*?(\d{4}).(\d{1,2}).(\d{1,2})'):
    #         y = time.group(1)
    #         m = time.group(2)
    #         d = time.group(3)
    #         if len(y) == 2:
    #             y = '20' + y
    #         if len(m) == 1:
    #             m = '0' + m
    #         if len(d) == 1:
    #             d = '0' + d
    #         notice_date = y + '-' + m + "-" + d
    #         # notice_time_ = datetime.strptime(notice_date, '%Y-%m-%d')
    #         return notice_date
    #     else:
    #         y='2000'
    #         mon='01'
    #         d='01'
    #         h='00'
    #         min='00'
    #     report_time_=y+'-'+mon+'-'+d+' '+h+":"+min
    #     # time_=datetime.strptime(report_time_,'%Y-%m-%d %H:%M')
    #     return report_time_
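
The year-less branch of format_time_again is easier to follow in isolation. A self-contained sketch of the same idea (the regex is a simplified stand-in and the sample input is invented): the year is borrowed from notify_time, and the hour is shifted by 12 when the text contains 下午 (afternoon):

import re

def normalize_report_time(text, notify_time):
    m = re.search(r'(\d{1,2})月(\d{1,2})日\D*?(\d{1,2})(?:\D+?(\d{1,2}))?', text)
    if not m:
        return text  # leave unmatched strings untouched
    year = notify_time.split('-')[0]           # borrow the year from the notice date
    mon, day, hour = m.group(1), m.group(2), int(m.group(3))
    minute = m.group(4) or '00'
    if '下午' in text and hour < 12:            # '下午' means afternoon
        hour += 12
    return '%s-%s-%s %d:%s' % (year, mon, day, hour, minute)

print(normalize_report_time('12月3日下午3点', '2021-11-20'))  # -> 2021-12-3 15:00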
Example #9
 def __init__(self, *a, **kw):
     super().__init__(*a, **kw)
     self.college = '清华大学交叉信息研究院'
     self.db = DBSession()
Example #10
class ThuIiisSpider(CrawlSpider):
    name = 'thu_iiis'
    allowed_domains = ['iiis.tsinghua.edu.cn']
    start_urls = ['https://iiis.tsinghua.edu.cn/list-265-1.html']
    domain_url = 'https://iiis.tsinghua.edu.cn'
    # custom_settings = {
    #     'ITEM_PIPELINES': {'armus1.pipelines.ArmusPipeline': 300}
    # }

    rules = (
        Rule(LinkExtractor(
            allow=r'https://iiis.tsinghua.edu.cn/list-265-\d{1,2}.html'),
             callback='parse_item',
             follow=None),
        # Rule(LinkExtractor(restrict_xpaths='.//ul[@class="pagination"]/li[last()-1]/a'),callback=None,follow=True)
    )

    def __init__(self, *a, **kw):
        super().__init__(*a, **kw)
        self.college = '清华大学交叉信息研究院'
        self.db = DBSession()

    def parse_item(self, response):

        reports = response.xpath('.//tbody//tr')
        for report in reports:
            notice_url = self.domain_url + report.xpath('./td/a/@href').get()
            if self.is_existed_urls(notice_url):
                continue
            else:
                notify_time = ''
                title = report.xpath('./td/a/text()').getall()
                title = ' '.join(''.join(title).split())
                # print(notice_url)
                speaker = report.xpath('./td[2]//text()').getall()
                speaker = ' '.join(''.join(speaker).split())
                time = report.xpath('./td[3]//text()').get()
                # time=self.format_time(time,notify_time)
                venue = report.xpath('./td[4]//text()').get()
                item = Armus_Item(title=title,
                                  speaker=speaker,
                                  venue=venue,
                                  time=time,
                                  url=notice_url,
                                  college=self.college,
                                  notify_time=notify_time)
                yield item
        next_page = response.xpath(
            "//ul[@class='pagination']/li[last()-1]/a/@href").extract_first()
        if next_page is not None:
            next_page = response.urljoin(next_page)
            yield scrapy.Request(next_page, callback=self.parse_item)

    def is_existed_urls(self, notice_url):
        url = self.db.query(
            Notification.url).filter(Notification.url == notice_url).all()
        # existed_urls=[]
        if len(url):
            return True
        else:
            return False

    def format_time(self, time, notify_time):
        # match YYYY年MM月DD日 (optionally 下午 = afternoon) HH:MM
        if re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+).+(\d+)', time):
            st = re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+).+(\d+)', time)
            y = st.group(1)
            mon = st.group(2)
            d = st.group(3)
            if len(y) == 2:
                y = '20' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            h = st.group(4)
            if re.search(r'.*?(\d{4}).(\d+).(\d+)..*?下午(\d+).+(\d+)', time):
                h = str(int(h) + 12)
            min = st.group(5)
            if len(h) == 1:
                h = '0' + h
            if len(min) == 1:
                min = '0' + min
        # match YYYY年MM月DD日 (optionally 下午) HH
        elif re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+)', time):
            st = re.search(r'.*?(\d{4}).(\d+).(\d+)..*?(\d+)', time)
            y = st.group(1)
            mon = st.group(2)
            d = st.group(3)
            if len(y) == 2:
                y = '20' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            h = st.group(4)
            if re.search(r'.*?(\d{4}).(\d+).(\d+)..*?下午(\d+).', time):
                h = str(int(h) + 12)
            if len(h) == 1:
                h = '0' + h
            min = '00'
        # match MM月DD日 (optionally 下午) HH:MM
        elif re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).+(\d*)', time):
            st = re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).+(\d*)', time)
            y = notify_time.split('-')[0]
            mon = st.group(1)
            d = st.group(2)
            h = st.group(3)
            min = st.group(4)
            if re.search(r'.*?(\d{1,2}).(\d+)..*?下午(\d+).+(\d*)', time):
                h = str(int(h) + 12)
            if len(y) == 1:
                y = '0' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            if len(h) == 1:
                h = '0' + h
            if len(min) == 1:
                min = '0' + min
        # match MM月DD日 (optionally 下午) HH
        elif re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).', time):
            st = re.search(r'.*?(\d{1,2}).(\d+)..*?(\d+).', time)
            y = notify_time.split('-')[0]
            mon = st.group(1)
            d = st.group(2)
            h = st.group(3)
            if re.search(r'.*?(\d{1,2}).(\d+)..*?下午(\d+).', time):
                h = str(int(h) + 12)
            if len(y) == 1:
                y = '0' + y
            if len(mon) == 1:
                mon = '0' + mon
            if len(d) == 1:
                d = '0' + d
            if len(h) == 1:
                h = '0' + h
            min = '00'
            #(r'.*?(\d+).(\d+)..*?(\d+).+(\d*)')
            # (r'.*?(\d+).(\d+)..*?下午(\d+).*?+(\d*)')
        else:
            y = '2000'
            mon = '01'
            d = '01'
            h = '00'
            min = '00'
        report_time_ = y + '-' + mon + '-' + d + ' ' + h + ":" + min
        # time_=datetime.strptime(report_time_,'%Y-%m-%d %H:%M')
        return report_time_
Example #11
class Data_Spider:
    def __init__(self):
        self.process = CrawlerProcess(get_project_settings())
        self.db = DBSession()
        self.init_seed_data()
        # set default keyword patterns
        # self.title_word=str(input('请输入学术讲座通知的匹配关键字:'))
        self.title = '报告题目:,学术报告:,题目,报告主题:,Title'        # (default values)
        self.speaker = '报告人:,主讲人:,汇报人:,Speaker,报告专家'
        self.venue = '地点:,Address,Venue,Place'
        self.time = '日期:,时间:,Time'
        self.title_word = ''

    # initialize the seed table data
    def init_seed_data(self):
        init=self.db.query(Seed).all()
        if len(init)==0:
            init_data=Seed()
            init_data.set_init_data(self.db)

    def set_college_url(self,college_url):
        # self.college_url=input('请输入需要爬取的学校的通知网址:')   #start_url
        self.college_url =college_url
    def set_college(self,college):
        self.college=college

    def set_next_xpath(self,next_xpath):
        self.next_xpath=next_xpath

    def set_url_xpath(self,url_xpath):
        self.url_xpath=url_xpath

    def set_text_xpath(self,text_xpath):
        self.text_xpath=text_xpath

    # multiple keywords are separated with ","
    def set_title_word(self):
        self.title_word = ''

    def set_notify_time_xpath(self,notify_time_xpath):
        if len(notify_time_xpath)>0:
            self.notify_time_xpath=notify_time_xpath
        else:
            self.notify_time_xpath=''

    # keyword setters, no longer used
    # def set_title(self,title):
    #     if len(title)>0:
    #         self.title=self.title+','+title
    #     self.title=self.title.replace(',',',')
    # def set_speaker(self,speaker):
    #     if len(speaker)>0:
    #         self.speaker=self.speaker+','+speaker
    #     self.speaker=self.speaker.replace(',',',')
    # def set_venue(self,venue):
    #     if len(venue)>0:
    #         self.venue=self.venue+','+venue
    #     self.venue = self.venue.replace(',', ',')
    # def set_time(self,time):
    #     if len(time)>0:
    #         self.time=self.time+','+time
    #     self.time = self.time.replace(',', ',')

    # def insert_seed(self,college_url):
    # def insert_seed_test(self):
    #     self.insert_seed()

    def insert_seed(self,db):
        # college_url=str(input('请输入需要爬取的学校的通知网址:'))
        # the prompts below are skipped now that there is a graphical interface
        # self.set_college_url(college_url)
        # college = str(input('请输入需要爬取的学校(学院)的名称:'))
        # self.set_college(college)
        # next_xpath=str(input('请输入通知网站下一页的xpath选择器路径:'))
        # self.set_next_xpath(next_xpath)
        # url_xpath=str(input('请输入通知网站下每个具体网站超链接的xpath路径:'))
        # self.set_url_xpath(url_xpath)
        # text_xpath=str(input('请输入具体通知页面下,爬取通知正文每行文字的xpath路径:'))
        # self.set_text_xpath(text_xpath)
        # notify_time_xpath=str(input('请输入具体通知页面下,爬取通知时间的xpath路径,默认为空(不存在就不必输入):'))
        # self.set_notify_time_xpath(notify_time_xpath)
        # # the five fields above are required; the following ones are optional
        # title_word=str(input('请输入总通知页面下通知标题的字符匹配规则:(可选择不输入)'))
        # self.title_word=title_word
        # title=str(input('请输入报告标题的字符匹配规则:(可选择不输入)'))
        # self.set_title(title)
        # speaker = str(input('请输入报告人的字符匹配规则:(可选择不输入)'))
        # self.set_speaker(speaker)
        # venue = str(input('请输入报告地点的字符匹配规则:(可选择不输入)'))
        # self.set_venue(venue)
        # time = str(input('请输入报告时间的字符匹配规则:(可选择不输入)'))
        # self.set_time(time)
        try:
            seed=Seed(start_url= self.college_url,college= self.college,url_xpath= self.url_xpath,
                         nextpage_xpath= self.next_xpath,title_word= self.title_word,notice_time_xpath= self.notify_time_xpath,
                        # title=self.title, speaker=self.speaker, venue=self.venue, time=self.time,
                         text_xpath= self.text_xpath)
            db.add(seed)
            db.commit()
        except Exception as e:
            print(e)
            db.rollback()
            print('插入数据失败')

    # crawling a single specified school
    def get_existed_urls(self,seed):
        existed_urls = []
        urls = self.db.query(Notification.url).filter(seed.college == Notification.college).all()
        # existed_urls=[]
        if len(urls)>0:
            for url in urls:
                existed_urls.append(url[0])
        return existed_urls

    # common flow for crawling a school's academic notices
    def common_spider(self,seed):
        urlHandle=UrlHandle()
        existed_urls=self.get_existed_urls(seed)
        urlHandle.set_start_url(seed.start_url)
        urlHandle.set_title_word(seed.title_word)
        urlHandle.set_existed_urls(existed_urls)
        urlHandle.set_nextpage_xpath(seed.nextpage_xpath)
        urlHandle.set_url_xpath(seed.url_xpath)
        title_urls=urlHandle.get_filte_urls()
        selenium_spider = SeleniumSpider(seed, title_urls)
        selenium_spider.start_selenium()
        # self.process.crawl(NoticeSpider,seed,title_urls)
        # self.process.start()

    # crawl academic notices for a single school
    def university_spider(self,seed):
        # college_url=self.set_college_url()
        # seed = self.db.query(Seed).filter(Seed.start_url == college_url).one()
        if seed.start_url == 'https://iiis.tsinghua.edu.cn/zh/seminars/':    # Tsinghua University
            self.process.crawl(ThuIiisSpider)
            self.process.start()
        else:
            self.common_spider(seed)

    # crawl academic notices for all schools; crawling them all in one go can fail
    def universities_spider(self):
        seeds=self.db.query(Seed).all()
        for seed in seeds:
            # call the single-school crawler for each school
            self.university_spider(seed)

    # def start_spider(self):
    #     is_one_spider=str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
    #     while True:
    #         print(is_one_spider)
    #         if is_one_spider in ['y','Y','yes','Yes']:
    #             college_url = str(input('请输入需要爬取的学校的通知网址:'))
    #             seed = self.db.query(Seed).filter(Seed.start_url == college_url).all()
    #             if len(seed)==0:
    #                 seed=self.insert_seed(college_url)
    #                 self.university_spider(seed)
    #             else:
    #                 self.university_spider(seed[0])
    #             is_continue=str(input(('爬取完成,是否继续?y/n')))
    #             if is_continue in ['y','Y','yes','Yes']:
    #                 is_one_spider = str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))
    #             else:
    #                 break
    #         elif is_one_spider in ['n','no','No','N']:
    #             self.universities_spider()
    #             print('所有信息爬取完成!')
    #             break
    #         else:
    #             print('你的输入错误,请重新输入:')
    #             is_one_spider=str(input('爬取一个学校学术信息(y),多个学校学术信息(n)?y/n'))

# run from the main program
# spider=Data_Spider()
# spider.start_spider()

# Sample answers for 西南交通大学信息科学与技术学院 (SWJTU School of Information Science and Technology):
# start_url: http://sist.swjtu.edu.cn/list.do?action=news&navId=40
# nextpage_xpath: //div[@class="tableFootLeft"]//a[text()="下一页"]
# url_xpath: //*[@id="rightPageContent"]/dl//dd
# text_xpath: //*[@id="newsBody"]
# notice_time_xpath: //*[@id="newsInfo"]

# Sample answers for 贵州大学计算机科学与技术学院 (Guizhou University College of Computer Science and Technology):
# start_url: http://cs.gzu.edu.cn/forum.php?mod=forumdisplay&fid=57&page=1
# url_xpath: //*[@id="newsList"]//p
# nextpage_xpath: //*[@id="bmbw0pgscl"]/div//a[text()='下一页']
# notice_time_xpath: //*[@id="ct"]/div[1]/div/div[1]/p
# text_xpath (full notice body): //td[@class="t_f"]
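
A sketch of feeding the first set of recorded answers above into the database by hand, using the same Seed constructor keywords as insert_seed (the keyword-pattern fields are left to Data_Spider's defaults):

from db_model.db_config import DBSession
from db_model.seeds import Seed

db = DBSession()
swjtu_seed = Seed(
    start_url='http://sist.swjtu.edu.cn/list.do?action=news&navId=40',
    college='西南交通大学信息科学与技术学院',
    nextpage_xpath='//div[@class="tableFootLeft"]//a[text()="下一页"]',
    url_xpath='//*[@id="rightPageContent"]/dl//dd',
    text_xpath='//*[@id="newsBody"]',
    notice_time_xpath='//*[@id="newsInfo"]',
    title_word='')
db.add(swjtu_seed)
db.commit()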
Example #12
class Notification(Base):
    """the class map to table of db_model.notifications"""

    __tablename__ = 'notifications'

    url = Column(String(255), primary_key=True)  # url of the full notice
    title = Column(String(255))  # talk title
    college = Column(String(255))  # university hosting the talk
    speaker = Column(String(255))  # speaker
    venue = Column(String(255))  # venue
    time = Column(String(255))  # time the talk is held
    notify_time = Column(String(20))  # time the notice was published

    # def __init__(self):                       # scrapy framework + selenium
    #     self.session = DBSession()
    #     Base.metadata.create_all(engine)
    #     self.info_byrelease=[] # sorted by notice publication time
    #     self.info_bytime=[]    # sorted by talk time

    def orderbyrelease(self):  # sort by publication time; only keep the past year's data
        # session = DBSession()
        self.session = DBSession()
        self.info_byrelease = []
        # temp = self.session.query(Notification).filter(
        #     Notification.notify_time >= datetime.datetime.now() - timedelta(days=365)).order_by(
        #     desc(Notification.notify_time)).all()
        temp = self.session.query(Notification).filter(
            and_(
                Notification.notify_time >=
                datetime.datetime.now() - timedelta(days=365),  # time window: past year
                or_(Notification.title.like("%密码学%"),
                    Notification.title.like("%信息安全%"),
                    Notification.title.like("%security%"),
                    Notification.title.like("%password%"))  # 筛选信息
            )).order_by(desc(Notification.notify_time)).all()
        print("按照通知发布时间由近及远排序:")
        for t in temp:
            t_dict = t.__dict__
            info = {
                'title': t_dict['title'],
                'speaker': t_dict['speaker'],
                'time': t_dict['time'],
                'venue': t_dict['venue'],
                'college': t_dict['college'],
                'url': t_dict['url'],
                'notify_time': t_dict['notify_time']
            }
            self.info_byrelease.append(info)
        print(self.info_byrelease)
        # print("讲座标题:", t_dict['title'])
        # print("报告人:", t_dict['speaker'])
        # print("时间:", t_dict['time'])
        # print("地点:", t_dict['venue'])
        # print("大学:", t_dict['college'])
        # print("通知全文链接:", t_dict['url'])
        # print()

    def orderbytime(self):  # sort by the time the talk is held; only keep the past year's data
        self.session = DBSession()
        self.info_bytime = []
        # temp = self.session.query(Notification).filter(
        #     Notification.notify_time >= datetime.datetime.now() - timedelta(days=365)).order_by(
        #     desc(Notification.time)).all()
        temp = self.session.query(Notification).filter(
            and_(
                Notification.time >=
                datetime.datetime.now() - timedelta(days=365),  # time window: past year
                or_(Notification.title.like("%密码学%"),
                    Notification.title.like("%信息安全%"),
                    Notification.title.like("%security%"),
                    Notification.title.like("%password%"))  # keyword filter
            )).order_by(desc(Notification.time)).all()
        print("按照报告举行时间由近及远排序:")
        for t in temp:
            t_dict = t.__dict__
            info = {
                'title': t_dict['title'],
                'speaker': t_dict['speaker'],
                'time': t_dict['time'],
                'venue': t_dict['venue'],
                'college': t_dict['college'],
                'url': t_dict['url'],
                'notify_time': t_dict['notify_time']
            }
            self.info_bytime.append(info)
        # print(self.info_bytime)
        # print("讲座标题:", t_dict['title'])
        # print("报告人:", t_dict['speaker'])
        # print("时间:", t_dict['time'])
        # print("地点:", t_dict['venue'])
        # print("大学:", t_dict['college'])
        # print("通知全文链接:", t_dict['url'])
        # print()
    def get_info_bytime(self):  # talk info sorted by talk time
        return self.info_bytime

    def get_info_byrelease(self):  # talk info sorted by notice publication time
        return self.info_byrelease
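
A minimal usage sketch for the model, assuming the tables already exist and DBSession comes from db_model.db_config:

from db_model.notifications import Notification

n = Notification()
n.orderbyrelease()  # queries and prints the past year's security-related notices
for info in n.get_info_byrelease():
    print(info['title'], info['notify_time'])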
Example #13
from db_model.seeds import Seed
from db_model.db_config import DBSession
from db_model.notifications import Notification
from UrlHandle import UrlHandle

from armus1.spiders.notice import NoticeSpider
from armus1.spiders.thu_iiis import ThuIiisSpider
# scrapy api
from scrapy.utils.project import get_project_settings
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(get_project_settings())

db = DBSession()
#scut_se=Seed(start_url='http://www2.scut.edu.cn/sse/xshd/list.htm',college='华南理工大学软件学院',
#url_xpath='.//*[@class="news_ul"]//li',
#nextpage_xpath='//*[@id="wp_paging_w67"]/ul/li[2]/a[3]',
#title_word='举办,举行',
#notice_time_xpath='//*[@id="page-content-wrapper"]/div[2]/div/div/div[2]/div/div/div/p/span[1]',
#title='汇报主题:,报告题目:,题目:,Title:,报告主题:',speaker='汇报人:,报告人:,Speaker',
#venue='地点:,venue:,Address:',time='Time:,时间:',
#text_xpath='//*[@id="page-content-wrapper"]/div[2]/div/div/div[2]/div/div/div/div[2]/div/div//p')

#jnu_xx=Seed(start_url='https://xxxy2016.jnu.edu.cn/Category_37/Index.aspx',
#college='暨南大学信息科学技术学院/网络空间安全学院',
#url_xpath='//*[@id="mainContent"]/div[2]/ul//li',
#nextpage_xpath='//*[@id="pe100_page_通用信息列表_普通式"]/div/a[9]',
#title_word='学术讲座',
#notice_time_xpath='//*[@id="mainContent"]/div[2]/div/div[1]/span[3]',