Example #1
    def parse(self, response):
        # Select the table that lists the teachers
        teacherItems = response.xpath(
            '//table[@class ="wp_article_list_table"]')
        # Get the anchor elements linking to each teacher's detail page
        nexturls = teacherItems.xpath('.//span[@class="Article_MicroImage"]')
        # Write the link data to the index file
        file = open('../docs/%s/index.txt' % self.name, 'a', encoding='utf-8')
        for urlt in nexturls:
            print(urlt.get())
            file.write(
                urlt.xpath('.//a/@title').get() + "," +
                urlt.xpath(".//a/@href").get() + "," + "南开大学文学院" + "," +
                response.url + '\n')
            # Save the anchor text
            m_f = open('../docs/%s/m_text/%s_m.txt' %
                       (self.name, urlt.xpath('.//a/@title').get()),
                       'w',
                       encoding='utf-8')
            m_f.write(str(urlt.get()))
            m_f.close()
            # Yield a follow-up request whose callback saves the teacher's photo
            item = TeacherinfoItem()
            item['image_name'] = urlt.xpath('.//a/@title').get()
            item['image_url'] = self.baseurl + urlt.xpath('.//img/@src').get()
            print(item['image_name'], item['image_url'])
            request = scrapy.Request(url=item['image_url'],
                                     callback=self.parseImg)
            request.meta['item'] = item
            yield request
            # Yield a follow-up request to the parser that extracts the teacher's details
            yield scrapy.Request(url=urlt.xpath(".//a/@href").get(),
                                 callback=self.parseTeacher)
        file.close()
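
The examples all populate a TeacherinfoItem that is never shown. A minimal sketch of what its item definition might look like, with the field names collected from the examples (the class body itself is an assumption); a sketch of the surrounding spider, including parseImg, follows Example #6.

import scrapy

class TeacherinfoItem(scrapy.Item):
    # Fields referenced by the example spiders
    image_name = scrapy.Field()
    image_url = scrapy.Field()
    name = scrapy.Field()
    position = scrapy.Field()
    info = scrapy.Field()
    xueyuan = scrapy.Field()
    parentUrl = scrapy.Field()
    m_text = scrapy.Field()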
Example #2
    def parseTeacher(self, response):
        #/html/body/div[3]/div/div/div
        data = response.meta['item']

        # Save the main text of the page
        details = response.xpath('//div[@portletmode="simpleArticleAttri"]')
        filename = str(
            details.xpath('.//div[@class="name"]/text()').get()).replace(
                '\n', '').replace(' ', '').replace('\r', '')
        f = open('../docs/%s/%s.txt' % (self.name, filename),
                 'w',
                 encoding='utf-8')  # file that stores the teacher's homepage text
        f.write(filename + '\n')
        for item in details.xpath('.//div[@class = "lxfs-info"]').xpath(
                './/div[@class="info"]'):
            #print(item)
            for text in item.xpath('.//text()').getall():
                f.write(
                    str(text).replace('\n', '').replace(' ',
                                                        '').replace('\r', ''))
                f.write('\n')
        for item in details.xpath(
                './/div[@class="layui-tab layui-tab-brief"]'):
            #print(item)
            for text in item.xpath('.//text()').getall():
                f.write(
                    str(text).replace('\n', '').replace(' ',
                                                        '').replace('\r', ''))
                f.write('\n')
        f.close()

        # Store the teacher-name-to-URL mapping
        file = open('../docs/%s/index.txt' % self.name, 'a', encoding='utf-8')
        # Basic index entry: name, page URL, school, parent URL
        file.write(filename + "," + response.url + ',' + data["xueyuan"] +
                   "," + data['parentUrl'] + '\n')
        file.close()

        # Save the anchor text
        m_f = open('../docs/%s/m_text/%s_m.txt' % (self.name, filename),
                   'w',
                   encoding='utf-8')
        m_f.write(str(data["m_text"]))
        m_f.close()
        # Save a snapshot of the page

        with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
                  'wb') as s_f:
            s_f.write(response.body)

        # Yield a follow-up request whose callback saves the teacher's photo
        imgurl = details.xpath('.//img/@src').get()
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        request = scrapy.Request(url=item['image_url'], callback=self.parseImg)
        request.meta['item'] = item
        yield request
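
This parseTeacher reads xueyuan, parentUrl and m_text out of response.meta['item'], so the request that reaches it has to carry that data; the parse() in Example #1 does not set it. A sketch of how the yielding side might pass it along, using the key names from this example and placeholder values:

            yield scrapy.Request(
                url=urlt.xpath(".//a/@href").get(),
                callback=self.parseTeacher,
                meta={'item': {
                    'xueyuan': '南开大学文学院',   # school name written into index.txt
                    'parentUrl': response.url,     # the listing page that linked here
                    'm_text': urlt.get(),          # anchor HTML saved as the anchor text
                }})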
Example #3
    def parseTeacher2(self, response):
        # /html/body/div[3]/div/div/div
        details = response.xpath('.//div[@class="jz_li_div"]')
        for temp in details:
            filename = temp.xpath('.//h3/text()').get()
            # Save a snapshot of the page
            with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
                      'wb') as s_f:
                s_f.write(response.body)

            print(filename)
            f = open('../docs/%s/%s.txt' % (self.name, filename),
                     'w',
                     encoding='utf-8')
            f.write(filename + '\n')
            imgurl = temp.xpath('.//img/@src').get()
            temp = temp.xpath('.//div[@class="jz_li_content"]')
            for item in temp.css('p'):
                print(item)
                for text in item.xpath('.//text()').getall():
                    f.write(
                        str(text).replace('\n',
                                          '').replace(' ',
                                                      '').replace('\r', ''))
                f.write('\n')
            f.close()
            # Store the mapping info in the index
            file = open('../docs/%s/index.txt' % self.name,
                        'a',
                        encoding='utf-8')
            file.write(filename + "," + response.url + "," + "南开大学经济学院" + "," +
                       response.url + '\n')
            file.close()

            # Save the anchor text
            m_f = open('../docs/%s/m_text/%s_m.txt' % (self.name, filename),
                       'w',
                       encoding='utf-8')
            m_f.write(str(temp.get()))
            m_f.close()
            # Yield a follow-up request whose callback saves the teacher's photo
            if imgurl is None:
                continue
            print(imgurl)
            item = TeacherinfoItem()
            item['image_name'] = filename
            item['image_url'] = self.baseurl + imgurl
            # print(item['image_name'], item['image_url'])
            request = scrapy.Request(url=item['image_url'],
                                     callback=self.parseImg)
            request.meta['item'] = item
            yield request
Example #4
    def parseTeacher(self, response):
        # /html/body/div[3]/div/div/div
        data = response.meta['item']
        details = response.xpath('.//div[@frag="面板21"]')
        filename = details.xpath('.//table[1]/tr[1]/td/text()').get()
        print(filename)
        f = open('../docs/%s/%s.txt' % (self.name, filename),
                 'w',
                 encoding='utf-8')
        f.write(filename + '\n')
        details = details.xpath('.//table[4]').xpath(".//table")
        for item in details.css('p'):
            print(item)
            for text in item.xpath('.//text()').getall():
                f.write(
                    str(text).replace('\n', '').replace(' ',
                                                        '').replace('\r', ''))
            f.write('\n')
        f.close()
        # Store the mapping info in the index
        file = open('../docs/%s/index.txt' % self.name, 'a', encoding='utf-8')
        file.write(filename + "," + response.url + ',' + data["xueyuan"] +
                   "," + data['parentUrl'] + '\n')
        file.close()
        # Save the anchor text
        m_f = open('../docs/%s/m_text/%s_m.txt' % (self.name, filename),
                   'w',
                   encoding='utf-8')
        m_f.write(str(data["m_text"]))
        m_f.close()

        # Save a snapshot of the page
        with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
                  'wb') as s_f:
            s_f.write(response.body)

        # Yield a follow-up request whose callback saves the teacher's photo
        imgurl = details.xpath('.//td[@class="MsoNormal STYLE1"]').xpath(
            './/img/@src').get()
        print(imgurl)
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        # print(item['image_name'], item['image_url'])
        request = scrapy.Request(url=item['image_url'], callback=self.parseImg)
        request.meta['item'] = item
        yield request
Example #5
    def parse(self, response):
        # Collection that holds the teacher items
        items = []
        print(response.body)
        for each in response.xpath("//div[@class='li_txt']"):
            # Wrap the extracted data in a `TeacherinfoItem` object
            item = TeacherinfoItem()
            # extract() returns a list of unicode strings
            name = each.xpath("h3/text()").extract()
            position = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # xpath() returns a list containing a single element
            item['name'] = name[0]
            item['position'] = position[0]
            item['info'] = info[0]

            items.append(item)
            yield item
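
Example #5 uses the older extract() plus list-indexing idiom; with the selector API used in the other examples the same loop can be written with get(), which returns the first match directly. A sketch under that assumption:

    def parse(self, response):
        for each in response.xpath("//div[@class='li_txt']"):
            item = TeacherinfoItem()
            # get() returns the first matching string (or None), so no [0] indexing is needed
            item['name'] = each.xpath("h3/text()").get()
            item['position'] = each.xpath("h4/text()").get()
            item['info'] = each.xpath("p/text()").get()
            yield item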
Example #6
    def parseTeacher(self, response):
        #/html/body/div[3]/div/div/div
        data = response.meta['item']

        details = response.xpath(
            '//div[@class="leader-list leader cl"]/div[@class="wp_articlecontent"]'
        )
        splitwords = re.findall(
            r'[\u4e00-\u9fa5]*|[a-zA-Z]*',
            str(
                details.xpath(
                    './/div[@class = "info"]/div[@class="name"]/text()').get())
        )
        filename = ''
        for p in [x for x in splitwords if len(x) >= 1]:
            filename = filename + p
        f = open('../docs/%s/%s.txt' % (self.name, filename),
                 'w',
                 encoding='utf-8')
        f.write(filename + '\n')
        for item in details.xpath('.//div[@class = "info"]').xpath(
                './/div[@class="label-value"]'):
            print(item)
            for text in item.xpath('.//text()').getall():
                f.write(
                    str(text).replace('\n', '').replace(' ',
                                                        '').replace('\r', ''))
            f.write('\n')
        for item in details.xpath('.//div[@id="tabsDiv"]').css('p'):
            print(item)
            for text in item.xpath('.//text()').getall():
                f.write(
                    str(text).replace('\n', '').replace(' ',
                                                        '').replace('\r', ''))
            f.write('\n')
        f.close()
        # Store the mapping info in the index
        file = open('../docs/%s/index.txt' % self.name, 'a', encoding='utf-8')
        file.write(filename + "," + response.url + ',' + data["xueyuan"] +
                   "," + data['parentUrl'] + '\n')
        file.close()

        # Save a snapshot of the page
        with open('%s/%s/%s.html' % (snapshots_path, self.name, filename),
                  'wb') as s_f:
            s_f.write(response.body)

        # Save the anchor text
        m_f = open('../docs/%s/m_text/%s_m.txt' % (self.name, filename),
                   'w',
                   encoding='utf-8')
        m_f.write(str(data["m_text"]))
        m_f.close()

        # Yield a follow-up request whose callback saves the teacher's photo
        imgurl = details.xpath('.//img/@src').get()
        item = TeacherinfoItem()
        item['image_name'] = filename
        item['image_url'] = self.baseurl + imgurl
        #print(item['image_name'], item['image_url'])
        request = scrapy.Request(url=item['image_url'], callback=self.parseImg)
        request.meta['item'] = item
        yield request
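
Every snippet also references self.name, self.baseurl, a module-level snapshots_path and a parseImg callback without defining them. A minimal sketch of the surrounding spider; all concrete values below are placeholders for illustration, not taken from the original project:

import os
import scrapy

snapshots_path = '../snapshots'                 # assumed root directory for the HTML snapshots

class TeacherSpider(scrapy.Spider):             # hypothetical name; the real spider is not shown
    name = 'wenxueyuan'                         # placeholder; used to build the ../docs/<name>/ paths
    baseurl = 'https://www.example.edu'         # placeholder; prefix prepended to relative image URLs
    start_urls = ['https://www.example.edu/teachers/list.htm']  # placeholder listing page

    # parse(), parseTeacher() and parseTeacher2() are the methods shown in the examples above

    def parseImg(self, response):
        # The item carried through request.meta says which teacher the photo belongs to
        item = response.meta['item']
        path = '../docs/%s/images/%s.jpg' % (self.name, item['image_name'])  # assumed location
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, 'wb') as img_f:
            img_f.write(response.body)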