Example #1
 def parse(self, response):
     # Parses each scraped section into its own Meilisearch document.
     hxs = Selector(response)
     if response.url in self.seen:
         self.log('already seen  %s' % response.url)
     else:
         self.log('parsing  %s' % response.url)
         self.seen.add(response.url)
     for url in hxs.xpath('//a/@href').extract():
         url = urljoin(response.url, url)
         if url not in self.seen and not re.search(r'\.(pdf|zip|png|gif|jpeg)$', url) and url.startswith(self.start_urls[0]):
             self.log("yielding request " + url)
             yield Request(url, callback=self.parse)
     item = DocsItem()
     item['site'] = self.name
     title = hxs.xpath('//title/text()').get()
     if title is not None:
         parts = title.split(" - ")
         item['title'] = parts[0]
         # The segment after " - " is the section; fall back to the title itself.
         item['section'] = parts[1] if len(parts) >= 2 else parts[0]
     else:
         item['title'] = "Platform.sh Community"
         item['section'] = "Platform.sh Community"
     item['url'] = response.url
     # Every document in Meilisearch needs a unique documentId.
     item['documentId'] = hashlib.sha1(str(response.url).encode('utf-8')).hexdigest()
     # Strip HTML tags from the crawled '.crawler' sections to index plain text.
     item['text'] = re.sub(r'<.*?>', '', ' '.join(hxs.css('.crawler').extract()))
     item['rank'] = self.rank
     item['subsections'] = item['section']
     yield item
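
All four examples assume a surrounding Scrapy spider and a DocsItem item class that the snippets only reference. A minimal sketch of that scaffolding for Example #1, assuming the field names used above; the spider name, start URL, and rank value are placeholders rather than values from the original project:

    import hashlib
    import re
    from urllib.parse import urljoin

    import scrapy
    from scrapy import Request, Selector


    class DocsItem(scrapy.Item):
        # Fields populated by the parse() method in Example #1.
        site = scrapy.Field()
        title = scrapy.Field()
        section = scrapy.Field()
        subsections = scrapy.Field()
        url = scrapy.Field()
        documentId = scrapy.Field()
        text = scrapy.Field()
        rank = scrapy.Field()


    class CommunitySpider(scrapy.Spider):
        name = 'community'                               # placeholder
        start_urls = ['https://community.platform.sh/']  # placeholder
        rank = 1                                         # placeholder

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.seen = set()  # URLs already parsed; checked at the top of parse()

        # parse() from Example #1 goes here.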
Example #2
 def parse(self, response):
     hxs = Selector(response)
     if response.url in self.seen:
         self.log('already seen  %s' % response.url)
     else:
         self.log('parsing  %s' % response.url)
         self.seen.add(response.url)
     for url in hxs.xpath('//a/@href').extract():
         url = urljoin(response.url, url)
         if url not in self.seen and not re.search(r'\.(pdf|zip)$', url) and url.startswith(self.start_urls[0]):
             self.log("yielding request " + url)
             yield Request(url, callback=self.parse)
     item = DocsItem()
     item['site'] = self.name
     # Derive the section from the first two URL path segments when present.
     path = response.url.split("/")
     if len(path) >= 5:
         item['section'] = self.get_section(path[3], path[4])
     else:
         item['section'] = self.default_section
     if item['section'] == "Demos":
         item['title'] = hxs.xpath('.//h1/text()').get()
     else:
         item['title'] = (hxs.xpath('//title/text()').get() or '').replace('| Platform.sh', '').strip()
     item['url'] = response.url
     item['documentId'] = hashlib.sha1(str(response.url).encode('utf-8')).hexdigest()
     # Collect visible text from the main container, excluding the footer.
     item['text'] = " ".join(response.xpath('.//div[contains(@class,"container-fluid") and not(contains(@class,"footer"))]//text()').getall())
     item['rank'] = self.rank
     item['subsections'] = item['section']
     yield item
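
Example #2 also reads a default_section attribute and calls a two-argument get_section helper that the snippet does not show. A plausible sketch of both, purely illustrative; the mapping keys and section names are invented:

    default_section = 'Website'  # invented fallback section

    # Invented mapping from the first two URL path segments to a section name;
    # the real keys depend on the site's URL structure.
    SECTION_MAP = {
        ('blog', ''): 'Blog',
        ('marketplace', 'demos'): 'Demos',
    }

    def get_section(self, first_segment, second_segment):
        # Fall back to default_section when the path is not recognized.
        return self.SECTION_MAP.get((first_segment, second_segment),
                                    self.default_section)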
Example #3
    def parse_data(self, response):
        # Builds a summary document for each page scraped from the docs site.
        item = DocsItem()

        # Use the body Scrapy already downloaded rather than re-fetching
        # the same URL with requests.
        item['length'] = str(len(response.body))

        item['url'] = response.url
        item['description'] = response.xpath(
            '//section[@class="section"]/h1/text()').extract()[0]
        item['title'] = response.xpath('//title/text()').extract()[0]
        item['texts'] = html2text.html2text(
            response.xpath('//section[@class="section"]').extract()[0])
        item['data'] = item['texts']

        # html2text emits Markdown, so strip its syntax to leave plain text.
        # Drop the leading "**此内容…**" ("this content…") translation banner.
        if re.findall(r'\*\*此内容.*?\*\*', item['data']):
            item['data'] = re.split(r'\*\*此内容.*?\*\*\s', item['data'])[1]
        item['data'] = re.sub(r'[<>]\s?', "", item['data'])        # blockquote markers
        item['data'] = re.sub(r'(^|\s)#+ \w.*', "", item['data'])  # headings
        item['data'] = re.sub(r'#+\s?', "", item['data'])          # stray hashes
        item['data'] = re.sub(r'\*+\s?', "", item['data'])         # emphasis markers
        # Drop the "预计阅读时间: … 分钟" (estimated reading time) line.
        if re.findall(r'预计阅读时间: .* 分钟.*', item['data']):
            item['data'] = re.sub(r'.* 分钟.*\s', "", item['data'])
        item['data'] = re.sub(r'\[.*\]\(.*\)', "", item['data'])   # Markdown links
        item['data'] = item['data'].replace('\n', '')              # newlines

        # Keep the first 150 characters as the summary snippet.
        item['data'] = item['data'][:150] + "..."
        return item
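
The regex cleanup above exists because html2text converts HTML into Markdown rather than plain text, so heading and emphasis markers survive the conversion. A quick illustration; the sample HTML is made up:

    import html2text

    sample = ('<section class="section"><h1>标题</h1>'
              '<p><strong>此内容为翻译</strong> 正文</p></section>')
    print(html2text.html2text(sample))
    # Prints Markdown that still carries '#' and '**' markers:
    # # 标题
    #
    # **此内容为翻译** 正文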
Example #4
 def parse(self, response):
     # Each block under the API content div becomes one searchable document.
     sections = response.xpath('.//div[contains(@class,"api-content")]/div')
     for section in sections:
         title = section.xpath('.//h2/text()').get()
         url = section.xpath('.//h2/a/@href').get()
         text = section.xpath('.//p/text()').getall()
         # getall() returns a list, so check for emptiness rather than None.
         if title is not None and url is not None and text:
             endpoint_section = self.get_section(url)
             if endpoint_section is not None:
                 item = DocsItem()
                 item['site'] = self.name
                 item['title'] = title
                 # Resolve the section href (typically a fragment) against the start URL.
                 item['url'] = urljoin(self.start_urls[0], url)
                 item['section'] = endpoint_section
                 item['documentId'] = hashlib.sha1(
                     str(item['url']).encode('utf-8')).hexdigest()
                 item['text'] = " ".join(text)
                 item['rank'] = self.rank
                 item['subsections'] = endpoint_section
                 yield item
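
Example #4's one-argument get_section maps an endpoint href to a section name and returns None for anything out of scope, which the caller uses to skip the item. A sketch under that assumption; the prefixes and section names are invented:

    # Invented href-prefix-to-section mapping for the API reference pages.
    API_SECTIONS = {
        '#environments': 'Environments',
        '#projects': 'Projects',
    }

    def get_section(self, href):
        for prefix, section in self.API_SECTIONS.items():
            if href.startswith(prefix):
                return section
        return None  # unknown endpoints are skipped by parse()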