import os
from queue import Empty
from bs4 import BeautifulSoup


def worker(url, fileArticle, tool):
    """Drain share_q and save every travelogue it points to."""
    global share_q, num
    while True:
        try:
            # non-blocking get avoids the empty()/get() race between threads
            url = share_q.get_nowait()
        except Empty:
            break
        pageCode = openurl(url)
        soup = BeautifulSoup(pageCode, 'html.parser')
        # anchors for every article on this listing page
        articleTags = soup.find_all('a', {'class': 'journal-item cf'})
        # extract each title and link
        for eachTag in articleTags:
            articleTitle = getArticleName(eachTag)
            articleLink = getArticleHtml(eachTag)
            # record title, link, and source listing page in the index file
            # (the original wrote an undefined `page` variable here)
            fileArticle.write('\n' + articleTitle + ' ' + articleLink + ' page = ' + url + '\n')
            # fetch the article itself
            articlePageCode = openurl(articleLink)
            articleSoup = BeautifulSoup(articlePageCode, 'html.parser')
            # the article body on the detail page
            contentDivs = articleSoup.find_all('div', {'class': 'ctd_content'})
            if not contentDivs:
                continue
            articleHtml = str(contentDivs[0])
            # without this charset header the saved page renders as mojibake in a browser
            articleHTML = '<html><meta charset="utf-8">' + articleHtml
            # strip markup down to plain text
            articleContents = tool.replace(articleHTML)
            num += 1  # not thread-safe on its own; see the note after the driver below
            with open(os.path.join(traveloguePath, str(num) + '.txt'), 'w', encoding='utf-8') as articleFile:
                articleFile.write(articleContents)
        share_q.task_done()
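# ---------------------------------------------------------------------------
# The worker above relies on module-level names that are not shown here
# (share_q, num, traveloguePath, openurl, getArticleName, getArticleHtml, and
# the tag-stripping `tool`). The driver below is a minimal sketch of how they
# could be wired together; openurl, HtmlTool, the two getArticle* helpers, and
# the listing URLs are assumptions, not the original implementation.
import re
import threading
import urllib.request
from queue import Queue

share_q = Queue()                  # listing-page URLs shared by all workers
num = 0                            # global counter used for output file names
traveloguePath = './travelogues'   # hypothetical output directory


def openurl(url):
    # minimal fetch helper; the original may add headers, retries, or encoding handling
    return urllib.request.urlopen(url).read()


def getArticleName(tag):
    # hypothetical: the visible link text is the article title
    return tag.get_text(strip=True)


def getArticleHtml(tag):
    # hypothetical: the href attribute holds the article URL
    return tag.get('href')


class HtmlTool:
    # hypothetical stand-in for `tool`: reduces an HTML fragment to plain text
    def replace(self, html):
        return re.sub(r'<[^>]+>', '', html)


if __name__ == '__main__':
    os.makedirs(traveloguePath, exist_ok=True)
    for page in range(1, 6):
        share_q.put('http://example.com/travelogues?page=%d' % page)  # placeholder URLs
    with open('articles.txt', 'w', encoding='utf-8') as fileArticle:
        threads = [threading.Thread(target=worker, args=(None, fileArticle, HtmlTool()))
                   for _ in range(4)]
        for t in threads:
            t.start()
        share_q.join()  # blocks until every queued page is task_done()
        for t in threads:
            t.join()
# Note: because each worker does `num += 1` on a shared global, running more
# than one thread can race on file names; a threading.Lock around the
# increment (or itertools.count) would make it safe.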
def parse_content(self, response):
    content = response.xpath('//div[@id="problem-detail"]/div')
    item = response.meta['item']
    tags = response.xpath('//div[@id="problem-detail"]//a[@class="label bg-success"]/text()').extract()
    # the last badge is the "LintCode" label itself, not a topic tag
    if tags and 'LintCode' in tags[-1]:
        tags.pop()
    related = response.xpath('//span[@class="m-l-sm title"]/text()').extract()
    # normalize related-problem titles into underscore-joined slugs
    related = [replace(r).strip().replace(' ', '_') for r in related]
    item['tags'] = tags
    item['related'] = related
    # the problem description is the first paragraph of the third detail block
    des = content[2].xpath('p').extract_first()
    item['content'] = []
    if des:
        item['content'].append(des)
    return item
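# parse_content above expects a partially filled item in response.meta; a
# plausible listing-page callback that hands it off is sketched below. The
# ProblemItem fields and the problem-link XPath are hypothetical, not the
# original spider's code.
import scrapy


class ProblemItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()
    tags = scrapy.Field()
    related = scrapy.Field()
    content = scrapy.Field()


def parse(self, response):
    # one request per problem link, carrying the item through meta
    for link in response.xpath('//a[contains(@href, "/problem/")]/@href').extract():
        item = ProblemItem()
        item['url'] = response.urljoin(link)
        yield scrapy.Request(item['url'], callback=self.parse_content,
                             meta={'item': item})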
from scrapy.selector import Selector


def parse_content(self, response):
    sel = Selector(response)
    content = sel.xpath("//div[@class='question-content']/p")
    tags = sel.xpath("//div[@class='question-content']//div[@id='tags']/following-sibling::span/a/text()").extract()
    related = sel.xpath("//div[@class='question-content']//div[@id='similar']/following-sibling::span/a/text()").extract()
    item = response.meta['item']
    # normalize related-problem titles into underscore-joined slugs
    related = [replace(r).strip().replace(' ', '_') for r in related]
    item['content'] = []
    item['tags'] = tags
    item['related'] = related
    for des in content:
        text = des.extract()
        # skip credit notes, inline <style> blocks, and "show hint" widgets
        if 'Credits' in text or 'style' in text or 'show hint' in text:
            continue
        if text:
            item['content'].append(text)
    return item
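# Both parse_content variants funnel text through a helper named replace()
# that is not shown. The sketch below is a guess consistent with its call
# sites (stripping markup so titles can become underscore-joined slugs), not
# the project's actual helper.
import re
from html import unescape


def replace(text):
    text = re.sub(r'<[^>]+>', '', text)   # drop tags
    text = unescape(text)                 # decode &amp; and friends
    return re.sub(r'\s+', ' ', text)      # collapse whitespace

# e.g. replace('<b>Two Sum</b>').strip().replace(' ', '_') -> 'Two_Sum'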