Example #1
import os
from bs4 import BeautifulSoup


def worker(url, fileArticle, tool):
    """Get all travelogues."""
    # `url` is overwritten immediately from the shared queue; openurl,
    # getArticleName, getArticleHtml, page, and traveloguePath are
    # defined elsewhere in the module.
    global share_q, num
    while not share_q.empty():
        url = share_q.get()
        pageCode = openurl(url)
        soup = BeautifulSoup(pageCode, 'html.parser')
        # tags: the anchor elements for every article listed on the page
        tags = soup.findAll('a', {'class': 'journal-item cf'})
        # extract each article's title and link
        for eachTag in tags:
            articleTitle = getArticleName(eachTag)
            articleLink = getArticleHtml(eachTag)
            # record title and link in the index file
            fileArticle.write('\n' + articleTitle + "         " + articleLink +
                              ' page = ' + str(page) + '\n')
            # fetch the article page itself
            articlePageCode = openurl(articleLink)
            soup = BeautifulSoup(articlePageCode, 'html.parser')
            # tags: the div holding the article body
            tags = soup.findAll('div', {'class': 'ctd_content'})
            articleHtml = str(tags[0])
            # without this meta tag the saved file renders as mojibake in a browser
            articleHTML = '<html><meta charset="utf-8">' + articleHtml
            # strip the markup down to plain text
            articleContents = tool.replace(articleHTML)
            num += 1
            # write under traveloguePath without os.chdir(), which is
            # process-wide and unsafe with multiple worker threads
            with open(os.path.join(traveloguePath, str(num) + '.txt'),
                      'w+', encoding='utf-8') as articleFile:
                articleFile.write(articleContents)
        share_q.task_done()
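For context, a minimal driver sketch (not part of the original project) showing how such a worker is typically wired up: a shared queue of listing-page URLs drained by a few threads. The listing URL, page range, and thread count are assumptions, and `tool` plus the helpers used inside worker() are taken to exist in the surrounding module as above.

import threading
from queue import Queue

share_q = Queue()
num = 0  # module-level article counter incremented by worker()

def main():
    # hypothetical listing URL; the real site and page range are assumptions
    base = 'http://example.com/travelogues?page='
    for page_no in range(1, 11):
        share_q.put(base + str(page_no))
    with open('articles_index.txt', 'w', encoding='utf-8') as fileArticle:
        # a real version would guard fileArticle with a lock, since several
        # threads write to it concurrently
        threads = [threading.Thread(target=worker, args=(None, fileArticle, tool))
                   for _ in range(4)]
        for t in threads:
            t.start()
        share_q.join()  # returns once task_done() has run for every queued URL
        for t in threads:
            t.join()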
Example #2
    def parse_content(self, response):
        content = response.xpath('//div[@id="problem-detail"]/div')
        item = response.meta['item']
        tags = response.xpath('//div[@id="problem-detail"]//a[@class="label bg-success"]/text()').extract()
        # drop the trailing site-name label ("LintCodde" is the spelling used
        # upstream, presumably a typo for "LintCode")
        if tags and "LintCodde" in tags[-1]:
            tags.pop()
        related = response.xpath('//span[@class="m-l-sm title"]/text()').extract()
        # `replace` is a text-cleanup helper defined elsewhere in the project
        for i in range(len(related)):
            related[i] = replace(related[i]).strip().replace(' ', '_')
        item['tags'] = tags
        item['related'] = related
        # first paragraph of the description; extract_first() returns None
        # instead of raising IndexError when the paragraph is missing
        des = content[2].xpath('p').extract_first()
        item['content'] = list()
        if des:
            item['content'].append(des)
        return item
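Both parse_content callbacks in this section pull a half-built item out of response.meta['item']. A minimal sketch of the producing side, with a hypothetical item class and list-page XPath (only Request's meta/callback mechanics are standard Scrapy; everything site-specific here is an assumption):

import scrapy

class ProblemItem(scrapy.Item):
    # field names taken from the callbacks shown in this section
    tags = scrapy.Field()
    related = scrapy.Field()
    content = scrapy.Field()

class ProblemSpider(scrapy.Spider):
    name = 'problems'
    start_urls = ['http://example.com/problem/list']  # hypothetical

    def parse(self, response):
        # hypothetical list-page selector; parse_content is the method above
        for href in response.xpath('//a[@class="problem-link"]/@href').extract():
            # hand the half-built item to parse_content via request meta
            yield scrapy.Request(response.urljoin(href),
                                 meta={'item': ProblemItem()},
                                 callback=self.parse_content)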
Example #3
    def parse_content(self, response):
        # on recent Scrapy, response.xpath(...) works directly; Selector(response)
        # is the older explicit form (requires: from scrapy.selector import Selector)
        sel = Selector(response)
        content = sel.xpath("//div[@class='question-content']/p")
        tags = sel.xpath("//div[@class='question-content']//div[@id='tags']/following-sibling::span/a/text()").extract()
        related = sel.xpath("//div[@class='question-content']//div[@id='similar']/following-sibling::span/a/text()").extract()
        item = response.meta['item']

        # `replace` is a text-cleanup helper defined elsewhere in the project
        for i in range(len(related)):
            related[i] = replace(related[i]).strip().replace(' ', '_')

        item['content'] = []
        item['tags'] = tags
        item['related'] = related

        # keep only real description paragraphs, skipping credits notes,
        # inline style blocks, and "show hint" widgets
        for des in content:
            text = des.extract()
            if 'Credits' in text or 'style' in text or 'show hint' in text:
                continue
            if text:
                item['content'].append(text)
        return item
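Both callbacks also call an external replace helper to clean extracted strings before normalizing related-problem titles. Its real definition is not shown in this section; the following is only a plausible stand-in, consistent with how it is called:

import re

def replace(text):
    # hypothetical stand-in: drop tags, unescape the common entities, and
    # collapse whitespace so titles can then be slugified with '_'
    text = re.sub(r'<[^>]+>', '', text)
    text = text.replace('&nbsp;', ' ').replace('&amp;', '&')
    return re.sub(r'\s+', ' ', text)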